aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorH. Peter Anvin <hpa@zytor.com>2010-04-29 19:53:17 -0400
committerH. Peter Anvin <hpa@zytor.com>2010-04-29 19:53:17 -0400
commitd9c5841e22231e4e49fd0a1004164e6fce59b7a6 (patch)
treee1f589c46b3ff79bbe7b1b2469f6362f94576da6 /fs
parentb701a47ba48b698976fb2fe05fb285b0edc1d26a (diff)
parent5967ed87ade85a421ef814296c3c7f182b08c225 (diff)
Merge branch 'x86/asm' into x86/atomic
Merge reason: Conflict between LOCK_PREFIX_HERE and relative alternatives pointers Resolved Conflicts: arch/x86/include/asm/alternative.h arch/x86/kernel/alternative.c Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/cache.c1
-rw-r--r--fs/9p/fid.c15
-rw-r--r--fs/9p/v9fs.c67
-rw-r--r--fs/9p/v9fs.h26
-rw-r--r--fs/9p/v9fs_vfs.h1
-rw-r--r--fs/9p/vfs_dentry.c1
-rw-r--r--fs/9p/vfs_dir.c16
-rw-r--r--fs/9p/vfs_file.c27
-rw-r--r--fs/9p/vfs_inode.c101
-rw-r--r--fs/9p/vfs_super.c8
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/Makefile2
-rw-r--r--fs/adfs/adfs.h2
-rw-r--r--fs/adfs/inode.c5
-rw-r--r--fs/adfs/super.c1
-rw-r--r--fs/affs/affs.h5
-rw-r--r--fs/affs/bitmap.c3
-rw-r--r--fs/affs/inode.c3
-rw-r--r--fs/affs/namei.c7
-rw-r--r--fs/affs/super.c32
-rw-r--r--fs/affs/symlink.c7
-rw-r--r--fs/afs/cache.c1
-rw-r--r--fs/afs/cmservice.c1
-rw-r--r--fs/afs/dir.c1
-rw-r--r--fs/afs/file.c2
-rw-r--r--fs/afs/fsclient.c1
-rw-r--r--fs/afs/inode.c1
-rw-r--r--fs/afs/internal.h3
-rw-r--r--fs/afs/mntpt.c26
-rw-r--r--fs/afs/rxrpc.c1
-rw-r--r--fs/afs/security.c5
-rw-r--r--fs/afs/super.c2
-rw-r--r--fs/afs/vlclient.c1
-rw-r--r--fs/afs/vlocation.c1
-rw-r--r--fs/afs/vnode.c1
-rw-r--r--fs/afs/volume.c7
-rw-r--r--fs/afs/write.c21
-rw-r--r--fs/anon_inodes.c2
-rw-r--r--fs/attr.c13
-rw-r--r--fs/autofs/root.c1
-rw-r--r--fs/autofs4/autofs_i.h7
-rw-r--r--fs/autofs4/dev-ioctl.c12
-rw-r--r--fs/autofs4/expire.c6
-rw-r--r--fs/autofs4/inode.c63
-rw-r--r--fs/autofs4/root.c475
-rw-r--r--fs/befs/datastream.c1
-rw-r--r--fs/befs/linuxvfs.c1
-rw-r--r--fs/bfs/inode.c48
-rw-r--r--fs/binfmt_aout.c55
-rw-r--r--fs/binfmt_elf.c178
-rw-r--r--fs/binfmt_elf_fdpic.c210
-rw-r--r--fs/binfmt_em86.c1
-rw-r--r--fs/binfmt_flat.c5
-rw-r--r--fs/binfmt_script.c1
-rw-r--r--fs/binfmt_som.c1
-rw-r--r--fs/bio-integrity.c4
-rw-r--r--fs/bio.c29
-rw-r--r--fs/block_dev.c29
-rw-r--r--fs/btrfs/acl.c14
-rw-r--r--fs/btrfs/async-thread.c1
-rw-r--r--fs/btrfs/btrfs_inode.h5
-rw-r--r--fs/btrfs/compression.c23
-rw-r--r--fs/btrfs/ctree.c5
-rw-r--r--fs/btrfs/ctree.h18
-rw-r--r--fs/btrfs/delayed-ref.c1
-rw-r--r--fs/btrfs/disk-io.c57
-rw-r--r--fs/btrfs/export.c4
-rw-r--r--fs/btrfs/extent-tree.c95
-rw-r--r--fs/btrfs/extent_io.c102
-rw-r--r--fs/btrfs/extent_io.h10
-rw-r--r--fs/btrfs/extent_map.c17
-rw-r--r--fs/btrfs/file-item.c1
-rw-r--r--fs/btrfs/file.c128
-rw-r--r--fs/btrfs/free-space-cache.c5
-rw-r--r--fs/btrfs/inode.c271
-rw-r--r--fs/btrfs/ioctl.c710
-rw-r--r--fs/btrfs/ioctl.h111
-rw-r--r--fs/btrfs/locking.c1
-rw-r--r--fs/btrfs/ordered-data.c50
-rw-r--r--fs/btrfs/ordered-data.h9
-rw-r--r--fs/btrfs/ref-cache.c1
-rw-r--r--fs/btrfs/ref-cache.h2
-rw-r--r--fs/btrfs/relocation.c16
-rw-r--r--fs/btrfs/super.c264
-rw-r--r--fs/btrfs/sysfs.c4
-rw-r--r--fs/btrfs/transaction.c120
-rw-r--r--fs/btrfs/tree-log.c3
-rw-r--r--fs/btrfs/volumes.c73
-rw-r--r--fs/buffer.c15
-rw-r--r--fs/cachefiles/interface.c1
-rw-r--r--fs/cachefiles/namei.c13
-rw-r--r--fs/cachefiles/rdwr.c1
-rw-r--r--fs/cachefiles/xattr.c1
-rw-r--r--fs/ceph/Kconfig27
-rw-r--r--fs/ceph/Makefile39
-rw-r--r--fs/ceph/README20
-rw-r--r--fs/ceph/addr.c1193
-rw-r--r--fs/ceph/armor.c99
-rw-r--r--fs/ceph/auth.c258
-rw-r--r--fs/ceph/auth.h84
-rw-r--r--fs/ceph/auth_none.c122
-rw-r--r--fs/ceph/auth_none.h28
-rw-r--r--fs/ceph/auth_x.c680
-rw-r--r--fs/ceph/auth_x.h49
-rw-r--r--fs/ceph/auth_x_protocol.h90
-rw-r--r--fs/ceph/buffer.c81
-rw-r--r--fs/ceph/buffer.h39
-rw-r--r--fs/ceph/caps.c2955
-rw-r--r--fs/ceph/ceph_debug.h37
-rw-r--r--fs/ceph/ceph_frag.c21
-rw-r--r--fs/ceph/ceph_frag.h109
-rw-r--r--fs/ceph/ceph_fs.c74
-rw-r--r--fs/ceph/ceph_fs.h650
-rw-r--r--fs/ceph/ceph_hash.c118
-rw-r--r--fs/ceph/ceph_hash.h13
-rw-r--r--fs/ceph/ceph_strings.c176
-rw-r--r--fs/ceph/crush/crush.c151
-rw-r--r--fs/ceph/crush/crush.h180
-rw-r--r--fs/ceph/crush/hash.c149
-rw-r--r--fs/ceph/crush/hash.h17
-rw-r--r--fs/ceph/crush/mapper.c596
-rw-r--r--fs/ceph/crush/mapper.h20
-rw-r--r--fs/ceph/crypto.c409
-rw-r--r--fs/ceph/crypto.h48
-rw-r--r--fs/ceph/debugfs.c484
-rw-r--r--fs/ceph/decode.h194
-rw-r--r--fs/ceph/dir.c1224
-rw-r--r--fs/ceph/export.c224
-rw-r--r--fs/ceph/file.c938
-rw-r--r--fs/ceph/inode.c1774
-rw-r--r--fs/ceph/ioctl.c160
-rw-r--r--fs/ceph/ioctl.h40
-rw-r--r--fs/ceph/mds_client.c3043
-rw-r--r--fs/ceph/mds_client.h335
-rw-r--r--fs/ceph/mdsmap.c174
-rw-r--r--fs/ceph/mdsmap.h54
-rw-r--r--fs/ceph/messenger.c2249
-rw-r--r--fs/ceph/messenger.h255
-rw-r--r--fs/ceph/mon_client.c835
-rw-r--r--fs/ceph/mon_client.h119
-rw-r--r--fs/ceph/msgpool.c186
-rw-r--r--fs/ceph/msgpool.h27
-rw-r--r--fs/ceph/msgr.h158
-rw-r--r--fs/ceph/osd_client.c1550
-rw-r--r--fs/ceph/osd_client.h166
-rw-r--r--fs/ceph/osdmap.c1062
-rw-r--r--fs/ceph/osdmap.h126
-rw-r--r--fs/ceph/pagelist.c55
-rw-r--r--fs/ceph/pagelist.h54
-rw-r--r--fs/ceph/rados.h376
-rw-r--r--fs/ceph/snap.c907
-rw-r--r--fs/ceph/super.c1031
-rw-r--r--fs/ceph/super.h901
-rw-r--r--fs/ceph/types.h29
-rw-r--r--fs/ceph/xattr.c845
-rw-r--r--fs/cifs/CHANGES9
-rw-r--r--fs/cifs/asn1.c2
-rw-r--r--fs/cifs/cifs_dfs_ref.c6
-rw-r--r--fs/cifs/cifs_fs_sb.h3
-rw-r--r--fs/cifs/cifs_spnego.c1
-rw-r--r--fs/cifs/cifs_unicode.c1
-rw-r--r--fs/cifs/cifsacl.c1
-rw-r--r--fs/cifs/cifsencrypt.c1
-rw-r--r--fs/cifs/cifsfs.c17
-rw-r--r--fs/cifs/cifsfs.h5
-rw-r--r--fs/cifs/cifsglob.h5
-rw-r--r--fs/cifs/cifspdu.h6
-rw-r--r--fs/cifs/cifsproto.h13
-rw-r--r--fs/cifs/cifssmb.c532
-rw-r--r--fs/cifs/connect.c52
-rw-r--r--fs/cifs/dir.c2
-rw-r--r--fs/cifs/dns_resolve.c1
-rw-r--r--fs/cifs/file.c40
-rw-r--r--fs/cifs/inode.c313
-rw-r--r--fs/cifs/link.c1
-rw-r--r--fs/cifs/misc.c2
-rw-r--r--fs/cifs/readdir.c9
-rw-r--r--fs/cifs/sess.c12
-rw-r--r--fs/cifs/smbencrypt.c1
-rw-r--r--fs/cifs/transport.c1
-rw-r--r--fs/cifs/xattr.c9
-rw-r--r--fs/coda/dir.c1
-rw-r--r--fs/coda/file.c1
-rw-r--r--fs/coda/inode.c9
-rw-r--r--fs/coda/upcall.c1
-rw-r--r--fs/compat.c19
-rw-r--r--fs/compat_binfmt_elf.c2
-rw-r--r--fs/compat_ioctl.c18
-rw-r--r--fs/configfs/inode.c1
-rw-r--r--fs/configfs/mount.c1
-rw-r--r--fs/configfs/symlink.c5
-rw-r--r--fs/dcache.c70
-rw-r--r--fs/debugfs/inode.c14
-rw-r--r--fs/devpts/inode.c1
-rw-r--r--fs/dlm/ast.c74
-rw-r--r--fs/dlm/ast.h4
-rw-r--r--fs/dlm/config.c1
-rw-r--r--fs/dlm/debug_fs.c3
-rw-r--r--fs/dlm/dlm_internal.h10
-rw-r--r--fs/dlm/lock.c121
-rw-r--r--fs/dlm/lockspace.c16
-rw-r--r--fs/dlm/lowcomms.c1
-rw-r--r--fs/dlm/member.c2
-rw-r--r--fs/dlm/netlink.c1
-rw-r--r--fs/dlm/plock.c1
-rw-r--r--fs/dlm/user.c11
-rw-r--r--fs/dlm/user.h4
-rw-r--r--fs/ecryptfs/crypto.c42
-rw-r--r--fs/ecryptfs/dentry.c1
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h15
-rw-r--r--fs/ecryptfs/file.c18
-rw-r--r--fs/ecryptfs/inode.c288
-rw-r--r--fs/ecryptfs/keystore.c1
-rw-r--r--fs/ecryptfs/kthread.c1
-rw-r--r--fs/ecryptfs/main.c15
-rw-r--r--fs/ecryptfs/messaging.c1
-rw-r--r--fs/ecryptfs/miscdev.c1
-rw-r--r--fs/ecryptfs/mmap.c39
-rw-r--r--fs/ecryptfs/super.c3
-rw-r--r--fs/eventfd.c90
-rw-r--r--fs/exec.c104
-rw-r--r--fs/exofs/common.h39
-rw-r--r--fs/exofs/exofs.h57
-rw-r--r--fs/exofs/inode.c216
-rw-r--r--fs/exofs/ios.c576
-rw-r--r--fs/exofs/pnfs.h10
-rw-r--r--fs/exofs/super.c130
-rw-r--r--fs/ext2/balloc.c13
-rw-r--r--fs/ext2/ext2.h2
-rw-r--r--fs/ext2/file.c5
-rw-r--r--fs/ext2/ialloc.c14
-rw-r--r--fs/ext2/inode.c18
-rw-r--r--fs/ext2/namei.c51
-rw-r--r--fs/ext2/super.c2
-rw-r--r--fs/ext2/symlink.c2
-rw-r--r--fs/ext2/xattr.c10
-rw-r--r--fs/ext2/xattr_security.c1
-rw-r--r--fs/ext3/balloc.c12
-rw-r--r--fs/ext3/file.c7
-rw-r--r--fs/ext3/ialloc.c20
-rw-r--r--fs/ext3/inode.c47
-rw-r--r--fs/ext3/namei.c24
-rw-r--r--fs/ext3/super.c248
-rw-r--r--fs/ext3/symlink.c2
-rw-r--r--fs/ext3/xattr.c22
-rw-r--r--fs/ext3/xattr_security.c1
-rw-r--r--fs/ext4/Kconfig1
-rw-r--r--fs/ext4/balloc.c35
-rw-r--r--fs/ext4/block_validity.c6
-rw-r--r--fs/ext4/dir.c14
-rw-r--r--fs/ext4/ext4.h119
-rw-r--r--fs/ext4/ext4_extents.h3
-rw-r--r--fs/ext4/ext4_jbd2.c4
-rw-r--r--fs/ext4/ext4_jbd2.h24
-rw-r--r--fs/ext4/extents.c351
-rw-r--r--fs/ext4/file.c13
-rw-r--r--fs/ext4/fsync.c18
-rw-r--r--fs/ext4/ialloc.c52
-rw-r--r--fs/ext4/inode.c759
-rw-r--r--fs/ext4/ioctl.c12
-rw-r--r--fs/ext4/mballoc.c99
-rw-r--r--fs/ext4/mballoc.h10
-rw-r--r--fs/ext4/migrate.c36
-rw-r--r--fs/ext4/move_extent.c37
-rw-r--r--fs/ext4/namei.c86
-rw-r--r--fs/ext4/resize.c102
-rw-r--r--fs/ext4/super.c397
-rw-r--r--fs/ext4/xattr.c66
-rw-r--r--fs/ext4/xattr_security.c1
-rw-r--r--fs/fat/cache.c1
-rw-r--r--fs/fat/inode.c11
-rw-r--r--fs/fat/namei_vfat.c33
-rw-r--r--fs/fcntl.c104
-rw-r--r--fs/fifo.c1
-rw-r--r--fs/file.c4
-rw-r--r--fs/file_table.c3
-rw-r--r--fs/filesystems.c2
-rw-r--r--fs/freevxfs/vxfs_subr.c1
-rw-r--r--fs/fs-writeback.c174
-rw-r--r--fs/fscache/Kconfig1
-rw-r--r--fs/fscache/object-list.c1
-rw-r--r--fs/fscache/object.c6
-rw-r--r--fs/fscache/operation.c5
-rw-r--r--fs/fscache/page.c2
-rw-r--r--fs/fscache/stats.c4
-rw-r--r--fs/fuse/cuse.c1
-rw-r--r--fs/fuse/dev.c30
-rw-r--r--fs/fuse/file.c3
-rw-r--r--fs/fuse/inode.c2
-rw-r--r--fs/generic_acl.c1
-rw-r--r--fs/gfs2/Kconfig1
-rw-r--r--fs/gfs2/aops.c4
-rw-r--r--fs/gfs2/bmap.c3
-rw-r--r--fs/gfs2/dentry.c1
-rw-r--r--fs/gfs2/export.c1
-rw-r--r--fs/gfs2/file.c40
-rw-r--r--fs/gfs2/glock.c79
-rw-r--r--fs/gfs2/glock.h9
-rw-r--r--fs/gfs2/glops.c17
-rw-r--r--fs/gfs2/incore.h9
-rw-r--r--fs/gfs2/inode.c6
-rw-r--r--fs/gfs2/lock_dlm.c17
-rw-r--r--fs/gfs2/log.c3
-rw-r--r--fs/gfs2/lops.c4
-rw-r--r--fs/gfs2/main.c28
-rw-r--r--fs/gfs2/meta_io.c46
-rw-r--r--fs/gfs2/meta_io.h12
-rw-r--r--fs/gfs2/ops_fstype.c20
-rw-r--r--fs/gfs2/ops_inode.c118
-rw-r--r--fs/gfs2/quota.c9
-rw-r--r--fs/gfs2/quota.h2
-rw-r--r--fs/gfs2/rgrp.c8
-rw-r--r--fs/gfs2/rgrp.h2
-rw-r--r--fs/gfs2/super.c35
-rw-r--r--fs/gfs2/sys.c9
-rw-r--r--fs/gfs2/util.c2
-rw-r--r--fs/gfs2/util.h1
-rw-r--r--fs/gfs2/xattr.c21
-rw-r--r--fs/hfs/bnode.c1
-rw-r--r--fs/hfs/btree.c1
-rw-r--r--fs/hfs/hfs_fs.h2
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfs/mdb.c1
-rw-r--r--fs/hfs/super.c1
-rw-r--r--fs/hfsplus/options.c1
-rw-r--r--fs/hfsplus/super.c3
-rw-r--r--fs/hostfs/hostfs_kern.c1
-rw-r--r--fs/hpfs/anode.c2
-rw-r--r--fs/hpfs/buffer.c1
-rw-r--r--fs/hpfs/dentry.c14
-rw-r--r--fs/hpfs/dir.c15
-rw-r--r--fs/hpfs/dnode.c21
-rw-r--r--fs/hpfs/ea.c7
-rw-r--r--fs/hpfs/hpfs_fn.h30
-rw-r--r--fs/hpfs/inode.c5
-rw-r--r--fs/hpfs/map.c6
-rw-r--r--fs/hpfs/name.c21
-rw-r--r--fs/hpfs/namei.c75
-rw-r--r--fs/hpfs/super.c1
-rw-r--r--fs/hppfs/hppfs.c20
-rw-r--r--fs/inode.c4
-rw-r--r--fs/internal.h2
-rw-r--r--fs/ioctl.c92
-rw-r--r--fs/ioprio.c1
-rw-r--r--fs/isofs/dir.c1
-rw-r--r--fs/isofs/namei.c1
-rw-r--r--fs/jbd/commit.c11
-rw-r--r--fs/jbd/recovery.c1
-rw-r--r--fs/jbd/transaction.c45
-rw-r--r--fs/jbd2/checkpoint.c16
-rw-r--r--fs/jbd2/commit.c32
-rw-r--r--fs/jbd2/journal.c134
-rw-r--r--fs/jbd2/recovery.c1
-rw-r--r--fs/jbd2/transaction.c43
-rw-r--r--fs/jffs2/compr_lzo.c1
-rw-r--r--fs/jffs2/compr_zlib.c1
-rw-r--r--fs/jffs2/debug.c1
-rw-r--r--fs/jffs2/file.c1
-rw-r--r--fs/jffs2/nodelist.c1
-rw-r--r--fs/jffs2/nodemgmt.c1
-rw-r--r--fs/jffs2/readinode.c2
-rw-r--r--fs/jffs2/symlink.c1
-rw-r--r--fs/jffs2/write.c1
-rw-r--r--fs/jfs/acl.c27
-rw-r--r--fs/jfs/file.c31
-rw-r--r--fs/jfs/inode.c16
-rw-r--r--fs/jfs/jfs_acl.h7
-rw-r--r--fs/jfs/jfs_dmap.c17
-rw-r--r--fs/jfs/jfs_dmap.h6
-rw-r--r--fs/jfs/jfs_dtree.c29
-rw-r--r--fs/jfs/jfs_extent.c16
-rw-r--r--fs/jfs/jfs_imap.c1
-rw-r--r--fs/jfs/jfs_inode.c8
-rw-r--r--fs/jfs/jfs_inode.h4
-rw-r--r--fs/jfs/jfs_logmgr.c1
-rw-r--r--fs/jfs/jfs_metapage.c1
-rw-r--r--fs/jfs/jfs_unicode.h1
-rw-r--r--fs/jfs/jfs_xtree.c21
-rw-r--r--fs/jfs/namei.c27
-rw-r--r--fs/jfs/resize.c6
-rw-r--r--fs/jfs/super.c7
-rw-r--r--fs/jfs/symlink.c14
-rw-r--r--fs/jfs/xattr.c18
-rw-r--r--fs/libfs.c78
-rw-r--r--fs/lockd/clntlock.c1
-rw-r--r--fs/lockd/clntproc.c1
-rw-r--r--fs/lockd/host.c2
-rw-r--r--fs/lockd/mon.c13
-rw-r--r--fs/lockd/svc.c3
-rw-r--r--fs/lockd/svc4proc.c1
-rw-r--r--fs/lockd/svclock.c1
-rw-r--r--fs/lockd/svcproc.c1
-rw-r--r--fs/lockd/svcsubs.c1
-rw-r--r--fs/locks.c7
-rw-r--r--fs/logfs/Kconfig17
-rw-r--r--fs/logfs/Makefile13
-rw-r--r--fs/logfs/compr.c95
-rw-r--r--fs/logfs/dev_bdev.c333
-rw-r--r--fs/logfs/dev_mtd.c254
-rw-r--r--fs/logfs/dir.c827
-rw-r--r--fs/logfs/file.c263
-rw-r--r--fs/logfs/gc.c739
-rw-r--r--fs/logfs/inode.c418
-rw-r--r--fs/logfs/journal.c898
-rw-r--r--fs/logfs/logfs.h736
-rw-r--r--fs/logfs/logfs_abi.h629
-rw-r--r--fs/logfs/readwrite.c2267
-rw-r--r--fs/logfs/segment.c930
-rw-r--r--fs/logfs/super.c657
-rw-r--r--fs/minix/inode.c8
-rw-r--r--fs/minix/itree_v1.c1
-rw-r--r--fs/mpage.c3
-rw-r--r--fs/namei.c597
-rw-r--r--fs/namespace.c63
-rw-r--r--fs/ncpfs/dir.c1
-rw-r--r--fs/ncpfs/file.c1
-rw-r--r--fs/ncpfs/inode.c8
-rw-r--r--fs/ncpfs/ioctl.c1
-rw-r--r--fs/ncpfs/mmap.c2
-rw-r--r--fs/ncpfs/sock.c1
-rw-r--r--fs/ncpfs/symlink.c1
-rw-r--r--fs/nfs/Kconfig3
-rw-r--r--fs/nfs/cache_lib.c1
-rw-r--r--fs/nfs/callback.c2
-rw-r--r--fs/nfs/callback.h8
-rw-r--r--fs/nfs/callback_proc.c166
-rw-r--r--fs/nfs/callback_xdr.c107
-rw-r--r--fs/nfs/client.c52
-rw-r--r--fs/nfs/delegation.c1
-rw-r--r--fs/nfs/delegation.h6
-rw-r--r--fs/nfs/dir.c7
-rw-r--r--fs/nfs/direct.c4
-rw-r--r--fs/nfs/dns_resolve.c19
-rw-r--r--fs/nfs/file.c35
-rw-r--r--fs/nfs/fscache.c10
-rw-r--r--fs/nfs/inode.c117
-rw-r--r--fs/nfs/internal.h2
-rw-r--r--fs/nfs/iostat.h4
-rw-r--r--fs/nfs/mount_clnt.c2
-rw-r--r--fs/nfs/namespace.c1
-rw-r--r--fs/nfs/nfs2xdr.c3
-rw-r--r--fs/nfs/nfs3acl.c1
-rw-r--r--fs/nfs/nfs3proc.c10
-rw-r--r--fs/nfs/nfs3xdr.c1
-rw-r--r--fs/nfs/nfs4_fs.h4
-rw-r--r--fs/nfs/nfs4namespace.c1
-rw-r--r--fs/nfs/nfs4proc.c196
-rw-r--r--fs/nfs/nfs4renewd.c24
-rw-r--r--fs/nfs/nfs4state.c120
-rw-r--r--fs/nfs/nfs4xdr.c19
-rw-r--r--fs/nfs/pagelist.c40
-rw-r--r--fs/nfs/proc.c42
-rw-r--r--fs/nfs/super.c41
-rw-r--r--fs/nfs/symlink.c3
-rw-r--r--fs/nfs/sysctl.c2
-rw-r--r--fs/nfs/write.c297
-rw-r--r--fs/nfs_common/nfsacl.c1
-rw-r--r--fs/nfsctl.c5
-rw-r--r--fs/nfsd/export.c11
-rw-r--r--fs/nfsd/nfs2acl.c1
-rw-r--r--fs/nfsd/nfs3acl.c1
-rw-r--r--fs/nfsd/nfs4acl.c1
-rw-r--r--fs/nfsd/nfs4callback.c6
-rw-r--r--fs/nfsd/nfs4idmap.c1
-rw-r--r--fs/nfsd/nfs4proc.c1
-rw-r--r--fs/nfsd/nfs4recover.c5
-rw-r--r--fs/nfsd/nfs4state.c7
-rw-r--r--fs/nfsd/nfs4xdr.c25
-rw-r--r--fs/nfsd/nfscache.c2
-rw-r--r--fs/nfsd/nfsctl.c25
-rw-r--r--fs/nfsd/vfs.c168
-rw-r--r--fs/nilfs2/alloc.c3
-rw-r--r--fs/nilfs2/alloc.h2
-rw-r--r--fs/nilfs2/bmap.c4
-rw-r--r--fs/nilfs2/btnode.c1
-rw-r--r--fs/nilfs2/btree.c2
-rw-r--r--fs/nilfs2/cpfile.c31
-rw-r--r--fs/nilfs2/dat.c5
-rw-r--r--fs/nilfs2/dir.c16
-rw-r--r--fs/nilfs2/direct.c17
-rw-r--r--fs/nilfs2/gcinode.c5
-rw-r--r--fs/nilfs2/inode.c1
-rw-r--r--fs/nilfs2/ioctl.c71
-rw-r--r--fs/nilfs2/mdt.c1
-rw-r--r--fs/nilfs2/namei.c13
-rw-r--r--fs/nilfs2/nilfs.h4
-rw-r--r--fs/nilfs2/page.c5
-rw-r--r--fs/nilfs2/recovery.c42
-rw-r--r--fs/nilfs2/segbuf.c37
-rw-r--r--fs/nilfs2/segbuf.h5
-rw-r--r--fs/nilfs2/segment.c146
-rw-r--r--fs/nilfs2/segment.h6
-rw-r--r--fs/nilfs2/sufile.c2
-rw-r--r--fs/nilfs2/super.c19
-rw-r--r--fs/nilfs2/the_nilfs.c40
-rw-r--r--fs/nilfs2/the_nilfs.h4
-rw-r--r--fs/notify/fsnotify.c1
-rw-r--r--fs/notify/inode_mark.c1
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c2
-rw-r--r--fs/notify/inotify/inotify_user.c63
-rw-r--r--fs/ntfs/ChangeLog1702
-rw-r--r--fs/ntfs/aops.c1
-rw-r--r--fs/ntfs/attrib.c1
-rw-r--r--fs/ntfs/compress.c1
-rw-r--r--fs/ntfs/dir.c3
-rw-r--r--fs/ntfs/file.c3
-rw-r--r--fs/ntfs/index.c2
-rw-r--r--fs/ntfs/inode.c2
-rw-r--r--fs/ntfs/inode.h4
-rw-r--r--fs/ntfs/mft.c1
-rw-r--r--fs/ntfs/namei.c1
-rw-r--r--fs/ntfs/super.c33
-rw-r--r--fs/ocfs2/Makefile1
-rw-r--r--fs/ocfs2/acl.c78
-rw-r--r--fs/ocfs2/alloc.c18
-rw-r--r--fs/ocfs2/aops.c20
-rw-r--r--fs/ocfs2/buffer_head_io.c3
-rw-r--r--fs/ocfs2/cluster/heartbeat.c7
-rw-r--r--fs/ocfs2/cluster/masklog.c3
-rw-r--r--fs/ocfs2/cluster/masklog.h7
-rw-r--r--fs/ocfs2/cluster/nodemanager.c1
-rw-r--r--fs/ocfs2/cluster/quorum.c1
-rw-r--r--fs/ocfs2/cluster/tcp.c14
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h4
-rw-r--r--fs/ocfs2/dir.c39
-rw-r--r--fs/ocfs2/dlm/Makefile3
-rw-r--r--fs/ocfs2/dlm/dlmapi.h2
-rw-r--r--fs/ocfs2/dlm/dlmast.c3
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c3
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c2
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c2
-rw-r--r--fs/ocfs2/dlm/dlmlock.c2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c42
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c149
-rw-r--r--fs/ocfs2/dlm/dlmthread.c1
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c9
-rw-r--r--fs/ocfs2/dlmfs/Makefile5
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c (renamed from fs/ocfs2/dlm/dlmfs.c)127
-rw-r--r--fs/ocfs2/dlmfs/dlmfsver.c (renamed from fs/ocfs2/dlm/dlmfsver.c)0
-rw-r--r--fs/ocfs2/dlmfs/dlmfsver.h (renamed from fs/ocfs2/dlm/dlmfsver.h)0
-rw-r--r--fs/ocfs2/dlmfs/userdlm.c (renamed from fs/ocfs2/dlm/userdlm.c)308
-rw-r--r--fs/ocfs2/dlmfs/userdlm.h (renamed from fs/ocfs2/dlm/userdlm.h)16
-rw-r--r--fs/ocfs2/dlmglue.c371
-rw-r--r--fs/ocfs2/export.c2
-rw-r--r--fs/ocfs2/extent_map.c5
-rw-r--r--fs/ocfs2/file.c60
-rw-r--r--fs/ocfs2/heartbeat.c1
-rw-r--r--fs/ocfs2/inode.c26
-rw-r--r--fs/ocfs2/ioctl.c14
-rw-r--r--fs/ocfs2/ioctl.h6
-rw-r--r--fs/ocfs2/journal.c2
-rw-r--r--fs/ocfs2/localalloc.c12
-rw-r--r--fs/ocfs2/locks.c2
-rw-r--r--fs/ocfs2/mmap.c1
-rw-r--r--fs/ocfs2/namei.c80
-rw-r--r--fs/ocfs2/ocfs2.h38
-rw-r--r--fs/ocfs2/ocfs2_fs.h68
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h79
-rw-r--r--fs/ocfs2/ocfs2_lockingver.h2
-rw-r--r--fs/ocfs2/quota_global.c8
-rw-r--r--fs/ocfs2/quota_local.c3
-rw-r--r--fs/ocfs2/refcounttree.c22
-rw-r--r--fs/ocfs2/stack_o2cb.c50
-rw-r--r--fs/ocfs2/stack_user.c50
-rw-r--r--fs/ocfs2/stackglue.c98
-rw-r--r--fs/ocfs2/stackglue.h95
-rw-r--r--fs/ocfs2/suballoc.c300
-rw-r--r--fs/ocfs2/suballoc.h6
-rw-r--r--fs/ocfs2/super.c12
-rw-r--r--fs/ocfs2/symlink.c10
-rw-r--r--fs/ocfs2/sysfile.c1
-rw-r--r--fs/ocfs2/uptodate.c4
-rw-r--r--fs/ocfs2/xattr.c2186
-rw-r--r--fs/omfs/inode.c11
-rw-r--r--fs/open.c9
-rw-r--r--fs/partitions/check.c8
-rw-r--r--fs/partitions/efi.c1
-rw-r--r--fs/partitions/msdos.c85
-rw-r--r--fs/pnode.c28
-rw-r--r--fs/pnode.h5
-rw-r--r--fs/proc/array.c96
-rw-r--r--fs/proc/base.c49
-rw-r--r--fs/proc/generic.c39
-rw-r--r--fs/proc/inode.c1
-rw-r--r--fs/proc/kcore.c3
-rw-r--r--fs/proc/kmsg.c14
-rw-r--r--fs/proc/nommu.c1
-rw-r--r--fs/proc/proc_devtree.c8
-rw-r--r--fs/proc/proc_net.c1
-rw-r--r--fs/proc/root.c6
-rw-r--r--fs/proc/stat.c1
-rw-r--r--fs/proc/task_mmu.c131
-rw-r--r--fs/proc/task_nommu.c1
-rw-r--r--fs/proc/vmcore.c1
-rw-r--r--fs/qnx4/inode.c3
-rw-r--r--fs/quota/Kconfig13
-rw-r--r--fs/quota/Makefile2
-rw-r--r--fs/quota/compat.c118
-rw-r--r--fs/quota/dquot.c443
-rw-r--r--fs/quota/netlink.c96
-rw-r--r--fs/quota/quota.c735
-rw-r--r--fs/ramfs/file-nommu.c27
-rw-r--r--fs/ramfs/inode.c1
-rw-r--r--fs/read_write.c2
-rw-r--r--fs/reiserfs/bitmap.c15
-rw-r--r--fs/reiserfs/dir.c3
-rw-r--r--fs/reiserfs/file.c2
-rw-r--r--fs/reiserfs/fix_node.c1
-rw-r--r--fs/reiserfs/inode.c51
-rw-r--r--fs/reiserfs/ioctl.c3
-rw-r--r--fs/reiserfs/journal.c36
-rw-r--r--fs/reiserfs/lock.c9
-rw-r--r--fs/reiserfs/namei.c31
-rw-r--r--fs/reiserfs/stree.c20
-rw-r--r--fs/reiserfs/super.c26
-rw-r--r--fs/reiserfs/xattr.c62
-rw-r--r--fs/reiserfs/xattr_acl.c3
-rw-r--r--fs/reiserfs/xattr_security.c3
-rw-r--r--fs/romfs/super.c1
-rw-r--r--fs/select.c19
-rw-r--r--fs/seq_file.c130
-rw-r--r--fs/signalfd.c1
-rw-r--r--fs/smbfs/file.c1
-rw-r--r--fs/smbfs/inode.c8
-rw-r--r--fs/smbfs/smbiod.c1
-rw-r--r--fs/smbfs/symlink.c1
-rw-r--r--fs/splice.c1
-rw-r--r--fs/squashfs/Makefile2
-rw-r--r--fs/squashfs/block.c81
-rw-r--r--fs/squashfs/cache.c1
-rw-r--r--fs/squashfs/decompressor.c68
-rw-r--r--fs/squashfs/decompressor.h55
-rw-r--r--fs/squashfs/dir.c1
-rw-r--r--fs/squashfs/export.c1
-rw-r--r--fs/squashfs/file.c1
-rw-r--r--fs/squashfs/fragment.c1
-rw-r--r--fs/squashfs/id.c1
-rw-r--r--fs/squashfs/inode.c1
-rw-r--r--fs/squashfs/namei.c1
-rw-r--r--fs/squashfs/squashfs.h8
-rw-r--r--fs/squashfs/squashfs_fs.h6
-rw-r--r--fs/squashfs/squashfs_fs_sb.h40
-rw-r--r--fs/squashfs/super.c53
-rw-r--r--fs/squashfs/symlink.c2
-rw-r--r--fs/squashfs/zlib_wrapper.c152
-rw-r--r--fs/super.c29
-rw-r--r--fs/sync.c18
-rw-r--r--fs/sysfs/bin.c50
-rw-r--r--fs/sysfs/dir.c144
-rw-r--r--fs/sysfs/file.c47
-rw-r--r--fs/sysfs/inode.c49
-rw-r--r--fs/sysfs/mount.c5
-rw-r--r--fs/sysfs/symlink.c39
-rw-r--r--fs/sysfs/sysfs.h28
-rw-r--r--fs/sysv/inode.c10
-rw-r--r--fs/sysv/sysv.h2
-rw-r--r--fs/timerfd.c1
-rw-r--r--fs/ubifs/commit.c1
-rw-r--r--fs/ubifs/debug.c1
-rw-r--r--fs/ubifs/dir.c2
-rw-r--r--fs/ubifs/file.c9
-rw-r--r--fs/ubifs/gc.c97
-rw-r--r--fs/ubifs/io.c1
-rw-r--r--fs/ubifs/lpt.c1
-rw-r--r--fs/ubifs/lpt_commit.c1
-rw-r--r--fs/ubifs/recovery.c1
-rw-r--r--fs/ubifs/sb.c1
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/ubifs/tnc.c1
-rw-r--r--fs/ubifs/ubifs.h1
-rw-r--r--fs/ubifs/xattr.c1
-rw-r--r--fs/udf/balloc.c96
-rw-r--r--fs/udf/dir.c4
-rw-r--r--fs/udf/file.c28
-rw-r--r--fs/udf/ialloc.c14
-rw-r--r--fs/udf/inode.c50
-rw-r--r--fs/udf/namei.c46
-rw-r--r--fs/udf/partition.c1
-rw-r--r--fs/udf/symlink.c11
-rw-r--r--fs/udf/udfdecl.h5
-rw-r--r--fs/udf/unicode.c1
-rw-r--r--fs/ufs/balloc.c24
-rw-r--r--fs/ufs/dir.c10
-rw-r--r--fs/ufs/file.c3
-rw-r--r--fs/ufs/ialloc.c11
-rw-r--r--fs/ufs/inode.c9
-rw-r--r--fs/ufs/namei.c18
-rw-r--r--fs/ufs/super.c9
-rw-r--r--fs/ufs/truncate.c10
-rw-r--r--fs/ufs/ufs.h6
-rw-r--r--fs/ufs/ufs_fs.h15
-rw-r--r--fs/xattr_acl.c2
-rw-r--r--fs/xfs/Makefile3
-rw-r--r--fs/xfs/linux-2.6/kmem.c57
-rw-r--r--fs/xfs/linux-2.6/kmem.h21
-rw-r--r--fs/xfs/linux-2.6/xfs_acl.c15
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c229
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c403
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h52
-rw-r--r--fs/xfs/linux-2.6/xfs_export.c20
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c854
-rw-r--r--fs/xfs/linux-2.6/xfs_fs_subr.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c22
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.h12
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c5
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c15
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c852
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.h32
-rw-r--r--fs/xfs/linux-2.6/xfs_quotaops.c19
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c190
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c339
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h3
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.c16
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h1222
-rw-r--r--fs/xfs/linux-2.6/xfs_xattr.c27
-rw-r--r--fs/xfs/quota/xfs_dquot.c47
-rw-r--r--fs/xfs/quota/xfs_dquot_item.c99
-rw-r--r--fs/xfs/quota/xfs_dquot_item.h4
-rw-r--r--fs/xfs/quota/xfs_qm.c40
-rw-r--r--fs/xfs/quota/xfs_qm_bhv.c2
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c6
-rw-r--r--fs/xfs/quota/xfs_trans_dquot.c49
-rw-r--r--fs/xfs/xfs_acl.h4
-rw-r--r--fs/xfs/xfs_ag.h16
-rw-r--r--fs/xfs/xfs_alloc.c134
-rw-r--r--fs/xfs/xfs_alloc_btree.c9
-rw-r--r--fs/xfs/xfs_attr.c52
-rw-r--r--fs/xfs/xfs_attr.h3
-rw-r--r--fs/xfs/xfs_attr_leaf.c30
-rw-r--r--fs/xfs/xfs_attr_sf.h2
-rw-r--r--fs/xfs/xfs_bmap.c221
-rw-r--r--fs/xfs/xfs_bmap_btree.c2
-rw-r--r--fs/xfs/xfs_bmap_btree.h1
-rw-r--r--fs/xfs/xfs_btree.c4
-rw-r--r--fs/xfs/xfs_buf_item.c72
-rw-r--r--fs/xfs/xfs_da_btree.c4
-rw-r--r--fs/xfs/xfs_da_btree.h5
-rw-r--r--fs/xfs/xfs_dfrag.c159
-rw-r--r--fs/xfs/xfs_dfrag.h3
-rw-r--r--fs/xfs/xfs_dir2.c8
-rw-r--r--fs/xfs/xfs_dir2.h4
-rw-r--r--fs/xfs/xfs_dir2_block.c9
-rw-r--r--fs/xfs/xfs_dir2_leaf.c2
-rw-r--r--fs/xfs/xfs_dir2_node.c2
-rw-r--r--fs/xfs/xfs_dir2_node.h2
-rw-r--r--fs/xfs/xfs_dir2_sf.c2
-rw-r--r--fs/xfs/xfs_extfree_item.c4
-rw-r--r--fs/xfs/xfs_filestream.c42
-rw-r--r--fs/xfs/xfs_filestream.h28
-rw-r--r--fs/xfs/xfs_fs.h3
-rw-r--r--fs/xfs/xfs_fsops.c42
-rw-r--r--fs/xfs/xfs_ialloc.c62
-rw-r--r--fs/xfs/xfs_iget.c30
-rw-r--r--fs/xfs/xfs_inode.c199
-rw-r--r--fs/xfs/xfs_inode.h14
-rw-r--r--fs/xfs/xfs_inode_item.c147
-rw-r--r--fs/xfs/xfs_inode_item.h6
-rw-r--r--fs/xfs/xfs_itable.c14
-rw-r--r--fs/xfs/xfs_log.c525
-rw-r--r--fs/xfs/xfs_log.h35
-rw-r--r--fs/xfs/xfs_log_priv.h5
-rw-r--r--fs/xfs/xfs_log_recover.c222
-rw-r--r--fs/xfs/xfs_log_recover.h23
-rw-r--r--fs/xfs/xfs_mount.c236
-rw-r--r--fs/xfs/xfs_mount.h31
-rw-r--r--fs/xfs/xfs_mru_cache.c2
-rw-r--r--fs/xfs/xfs_mru_cache.h1
-rw-r--r--fs/xfs/xfs_quota.h9
-rw-r--r--fs/xfs/xfs_rtalloc.c2
-rw-r--r--fs/xfs/xfs_rw.c155
-rw-r--r--fs/xfs/xfs_rw.h4
-rw-r--r--fs/xfs/xfs_trans.c9
-rw-r--r--fs/xfs/xfs_trans.h5
-rw-r--r--fs/xfs/xfs_trans_ail.c34
-rw-r--r--fs/xfs/xfs_trans_buf.c243
-rw-r--r--fs/xfs/xfs_types.h4
-rw-r--r--fs/xfs/xfs_vnodeops.c213
-rw-r--r--fs/xfs/xfs_vnodeops.h25
779 files changed, 54723 insertions, 14504 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index e777961939f3..0dbe0d139ac2 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24#include <linux/file.h> 24#include <linux/file.h>
25#include <linux/slab.h>
25#include <linux/stat.h> 26#include <linux/stat.h>
26#include <linux/sched.h> 27#include <linux/sched.h>
27#include <linux/fs.h> 28#include <linux/fs.h>
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 14d944204571..7317b39b2815 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -24,6 +24,7 @@
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/errno.h> 25#include <linux/errno.h>
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/slab.h>
27#include <linux/sched.h> 28#include <linux/sched.h>
28#include <linux/idr.h> 29#include <linux/idr.h>
29#include <net/9p/9p.h> 30#include <net/9p/9p.h>
@@ -110,7 +111,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
110{ 111{
111 int i, n, l, clone, any, access; 112 int i, n, l, clone, any, access;
112 u32 uid; 113 u32 uid;
113 struct p9_fid *fid; 114 struct p9_fid *fid, *old_fid = NULL;
114 struct dentry *d, *ds; 115 struct dentry *d, *ds;
115 struct v9fs_session_info *v9ses; 116 struct v9fs_session_info *v9ses;
116 char **wnames, *uname; 117 char **wnames, *uname;
@@ -151,7 +152,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
151 if (access == V9FS_ACCESS_SINGLE) 152 if (access == V9FS_ACCESS_SINGLE)
152 return ERR_PTR(-EPERM); 153 return ERR_PTR(-EPERM);
153 154
154 if (v9fs_extended(v9ses)) 155 if (v9fs_proto_dotu(v9ses))
155 uname = NULL; 156 uname = NULL;
156 else 157 else
157 uname = v9ses->uname; 158 uname = v9ses->uname;
@@ -183,10 +184,18 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
183 l = min(n - i, P9_MAXWELEM); 184 l = min(n - i, P9_MAXWELEM);
184 fid = p9_client_walk(fid, l, &wnames[i], clone); 185 fid = p9_client_walk(fid, l, &wnames[i], clone);
185 if (IS_ERR(fid)) { 186 if (IS_ERR(fid)) {
187 if (old_fid) {
188 /*
189 * If we fail, clunk fid which are mapping
190 * to path component and not the last component
191 * of the path.
192 */
193 p9_client_clunk(old_fid);
194 }
186 kfree(wnames); 195 kfree(wnames);
187 return fid; 196 return fid;
188 } 197 }
189 198 old_fid = fid;
190 i += l; 199 i += l;
191 clone = 0; 200 clone = 0;
192 } 201 }
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index cf62b05e296a..f8b86e92cd66 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -29,6 +29,7 @@
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/parser.h> 30#include <linux/parser.h>
31#include <linux/idr.h> 31#include <linux/idr.h>
32#include <linux/slab.h>
32#include <net/9p/9p.h> 33#include <net/9p/9p.h>
33#include <net/9p/client.h> 34#include <net/9p/client.h>
34#include <net/9p/transport.h> 35#include <net/9p/transport.h>
@@ -84,7 +85,7 @@ static const match_table_t tokens = {
84 85
85static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) 86static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
86{ 87{
87 char *options; 88 char *options, *tmp_options;
88 substring_t args[MAX_OPT_ARGS]; 89 substring_t args[MAX_OPT_ARGS];
89 char *p; 90 char *p;
90 int option = 0; 91 int option = 0;
@@ -102,9 +103,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
102 if (!opts) 103 if (!opts)
103 return 0; 104 return 0;
104 105
105 options = kstrdup(opts, GFP_KERNEL); 106 tmp_options = kstrdup(opts, GFP_KERNEL);
106 if (!options) 107 if (!tmp_options) {
108 ret = -ENOMEM;
107 goto fail_option_alloc; 109 goto fail_option_alloc;
110 }
111 options = tmp_options;
108 112
109 while ((p = strsep(&options, ",")) != NULL) { 113 while ((p = strsep(&options, ",")) != NULL) {
110 int token; 114 int token;
@@ -159,8 +163,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
159 break; 163 break;
160 case Opt_cache: 164 case Opt_cache:
161 s = match_strdup(&args[0]); 165 s = match_strdup(&args[0]);
162 if (!s) 166 if (!s) {
163 goto fail_option_alloc; 167 ret = -ENOMEM;
168 P9_DPRINTK(P9_DEBUG_ERROR,
169 "problem allocating copy of cache arg\n");
170 goto free_and_return;
171 }
164 172
165 if (strcmp(s, "loose") == 0) 173 if (strcmp(s, "loose") == 0)
166 v9ses->cache = CACHE_LOOSE; 174 v9ses->cache = CACHE_LOOSE;
@@ -173,8 +181,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
173 181
174 case Opt_access: 182 case Opt_access:
175 s = match_strdup(&args[0]); 183 s = match_strdup(&args[0]);
176 if (!s) 184 if (!s) {
177 goto fail_option_alloc; 185 ret = -ENOMEM;
186 P9_DPRINTK(P9_DEBUG_ERROR,
187 "problem allocating copy of access arg\n");
188 goto free_and_return;
189 }
178 190
179 v9ses->flags &= ~V9FS_ACCESS_MASK; 191 v9ses->flags &= ~V9FS_ACCESS_MASK;
180 if (strcmp(s, "user") == 0) 192 if (strcmp(s, "user") == 0)
@@ -194,13 +206,11 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
194 continue; 206 continue;
195 } 207 }
196 } 208 }
197 kfree(options);
198 return ret;
199 209
210free_and_return:
211 kfree(tmp_options);
200fail_option_alloc: 212fail_option_alloc:
201 P9_DPRINTK(P9_DEBUG_ERROR, 213 return ret;
202 "failed to allocate copy of option argument\n");
203 return -ENOMEM;
204} 214}
205 215
206/** 216/**
@@ -228,11 +238,18 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
228 return ERR_PTR(-ENOMEM); 238 return ERR_PTR(-ENOMEM);
229 } 239 }
230 240
241 rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
242 if (rc) {
243 __putname(v9ses->aname);
244 __putname(v9ses->uname);
245 return ERR_PTR(rc);
246 }
247
231 spin_lock(&v9fs_sessionlist_lock); 248 spin_lock(&v9fs_sessionlist_lock);
232 list_add(&v9ses->slist, &v9fs_sessionlist); 249 list_add(&v9ses->slist, &v9fs_sessionlist);
233 spin_unlock(&v9fs_sessionlist_lock); 250 spin_unlock(&v9fs_sessionlist_lock);
234 251
235 v9ses->flags = V9FS_EXTENDED | V9FS_ACCESS_USER; 252 v9ses->flags = V9FS_ACCESS_USER;
236 strcpy(v9ses->uname, V9FS_DEFUSER); 253 strcpy(v9ses->uname, V9FS_DEFUSER);
237 strcpy(v9ses->aname, V9FS_DEFANAME); 254 strcpy(v9ses->aname, V9FS_DEFANAME);
238 v9ses->uid = ~0; 255 v9ses->uid = ~0;
@@ -253,13 +270,15 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
253 goto error; 270 goto error;
254 } 271 }
255 272
256 if (!v9ses->clnt->dotu) 273 if (p9_is_proto_dotl(v9ses->clnt))
257 v9ses->flags &= ~V9FS_EXTENDED; 274 v9ses->flags |= V9FS_PROTO_2000L;
275 else if (p9_is_proto_dotu(v9ses->clnt))
276 v9ses->flags |= V9FS_PROTO_2000U;
258 277
259 v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; 278 v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
260 279
261 /* for legacy mode, fall back to V9FS_ACCESS_ANY */ 280 /* for legacy mode, fall back to V9FS_ACCESS_ANY */
262 if (!v9fs_extended(v9ses) && 281 if (!v9fs_proto_dotu(v9ses) &&
263 ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) { 282 ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) {
264 283
265 v9ses->flags &= ~V9FS_ACCESS_MASK; 284 v9ses->flags &= ~V9FS_ACCESS_MASK;
@@ -289,6 +308,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
289 return fid; 308 return fid;
290 309
291error: 310error:
311 bdi_destroy(&v9ses->bdi);
292 return ERR_PTR(retval); 312 return ERR_PTR(retval);
293} 313}
294 314
@@ -314,6 +334,8 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
314 __putname(v9ses->uname); 334 __putname(v9ses->uname);
315 __putname(v9ses->aname); 335 __putname(v9ses->aname);
316 336
337 bdi_destroy(&v9ses->bdi);
338
317 spin_lock(&v9fs_sessionlist_lock); 339 spin_lock(&v9fs_sessionlist_lock);
318 list_del(&v9ses->slist); 340 list_del(&v9ses->slist);
319 spin_unlock(&v9fs_sessionlist_lock); 341 spin_unlock(&v9fs_sessionlist_lock);
@@ -331,6 +353,19 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
331 p9_client_disconnect(v9ses->clnt); 353 p9_client_disconnect(v9ses->clnt);
332} 354}
333 355
356/**
357 * v9fs_session_begin_cancel - Begin terminate of a session
358 * @v9ses: session to terminate
359 *
360 * After this call we don't allow any request other than clunk.
361 */
362
363void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses)
364{
365 P9_DPRINTK(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses);
366 p9_client_begin_disconnect(v9ses->clnt);
367}
368
334extern int v9fs_error_init(void); 369extern int v9fs_error_init(void);
335 370
336static struct kobject *v9fs_kobj; 371static struct kobject *v9fs_kobj;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 019f4ccb70c1..bec4d0bcb458 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -20,10 +20,12 @@
20 * Boston, MA 02111-1301 USA 20 * Boston, MA 02111-1301 USA
21 * 21 *
22 */ 22 */
23#include <linux/backing-dev.h>
23 24
24/** 25/**
25 * enum p9_session_flags - option flags for each 9P session 26 * enum p9_session_flags - option flags for each 9P session
26 * @V9FS_EXTENDED: whether or not to use 9P2000.u extensions 27 * @V9FS_PROTO_2000U: whether or not to use 9P2000.u extensions
28 * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions
27 * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy 29 * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy
28 * @V9FS_ACCESS_USER: a new attach will be issued for every user (default) 30 * @V9FS_ACCESS_USER: a new attach will be issued for every user (default)
29 * @V9FS_ACCESS_ANY: use a single attach for all users 31 * @V9FS_ACCESS_ANY: use a single attach for all users
@@ -32,11 +34,12 @@
32 * Session flags reflect options selected by users at mount time 34 * Session flags reflect options selected by users at mount time
33 */ 35 */
34enum p9_session_flags { 36enum p9_session_flags {
35 V9FS_EXTENDED = 0x01, 37 V9FS_PROTO_2000U = 0x01,
36 V9FS_ACCESS_SINGLE = 0x02, 38 V9FS_PROTO_2000L = 0x02,
37 V9FS_ACCESS_USER = 0x04, 39 V9FS_ACCESS_SINGLE = 0x04,
38 V9FS_ACCESS_ANY = 0x06, 40 V9FS_ACCESS_USER = 0x08,
39 V9FS_ACCESS_MASK = 0x06, 41 V9FS_ACCESS_ANY = 0x0C,
42 V9FS_ACCESS_MASK = 0x0C,
40}; 43};
41 44
42/* possible values of ->cache */ 45/* possible values of ->cache */
@@ -100,12 +103,14 @@ struct v9fs_session_info {
100 u32 uid; /* if ACCESS_SINGLE, the uid that has access */ 103 u32 uid; /* if ACCESS_SINGLE, the uid that has access */
101 struct p9_client *clnt; /* 9p client */ 104 struct p9_client *clnt; /* 9p client */
102 struct list_head slist; /* list of sessions registered with v9fs */ 105 struct list_head slist; /* list of sessions registered with v9fs */
106 struct backing_dev_info bdi;
103}; 107};
104 108
105struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, 109struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
106 char *); 110 char *);
107void v9fs_session_close(struct v9fs_session_info *v9ses); 111void v9fs_session_close(struct v9fs_session_info *v9ses);
108void v9fs_session_cancel(struct v9fs_session_info *v9ses); 112void v9fs_session_cancel(struct v9fs_session_info *v9ses);
113void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
109 114
110#define V9FS_MAGIC 0x01021997 115#define V9FS_MAGIC 0x01021997
111 116
@@ -121,7 +126,12 @@ static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
121 return (inode->i_sb->s_fs_info); 126 return (inode->i_sb->s_fs_info);
122} 127}
123 128
124static inline int v9fs_extended(struct v9fs_session_info *v9ses) 129static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses)
125{ 130{
126 return v9ses->flags & V9FS_EXTENDED; 131 return v9ses->flags & V9FS_PROTO_2000U;
132}
133
134static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
135{
136 return v9ses->flags & V9FS_PROTO_2000L;
127} 137}
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 3a7560e35865..ed835836e0dc 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -60,3 +60,4 @@ void v9fs_dentry_release(struct dentry *);
60int v9fs_uflags2omode(int uflags, int extended); 60int v9fs_uflags2omode(int uflags, int extended);
61 61
62ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64); 62ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
63void v9fs_blank_wstat(struct p9_wstat *wstat);
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index d74325295b1e..cbf4e50f3933 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -34,6 +34,7 @@
34#include <linux/namei.h> 34#include <linux/namei.h>
35#include <linux/idr.h> 35#include <linux/idr.h>
36#include <linux/sched.h> 36#include <linux/sched.h>
37#include <linux/slab.h>
37#include <net/9p/9p.h> 38#include <net/9p/9p.h>
38#include <net/9p/client.h> 39#include <net/9p/client.h>
39 40
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 15cce53bf61e..0adfd64dfcee 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -32,6 +32,7 @@
32#include <linux/sched.h> 32#include <linux/sched.h>
33#include <linux/inet.h> 33#include <linux/inet.h>
34#include <linux/idr.h> 34#include <linux/idr.h>
35#include <linux/slab.h>
35#include <net/9p/9p.h> 36#include <net/9p/9p.h>
36#include <net/9p/client.h> 37#include <net/9p/client.h>
37 38
@@ -76,6 +77,15 @@ static inline int dt_type(struct p9_wstat *mistat)
76 return rettype; 77 return rettype;
77} 78}
78 79
80static void p9stat_init(struct p9_wstat *stbuf)
81{
82 stbuf->name = NULL;
83 stbuf->uid = NULL;
84 stbuf->gid = NULL;
85 stbuf->muid = NULL;
86 stbuf->extension = NULL;
87}
88
79/** 89/**
80 * v9fs_dir_readdir - read a directory 90 * v9fs_dir_readdir - read a directory
81 * @filp: opened file structure 91 * @filp: opened file structure
@@ -121,6 +131,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
121 rdir = (struct p9_rdir *) fid->rdir; 131 rdir = (struct p9_rdir *) fid->rdir;
122 132
123 err = mutex_lock_interruptible(&rdir->mutex); 133 err = mutex_lock_interruptible(&rdir->mutex);
134 if (err)
135 return err;
124 while (err == 0) { 136 while (err == 0) {
125 if (rdir->tail == rdir->head) { 137 if (rdir->tail == rdir->head) {
126 err = v9fs_file_readn(filp, rdir->buf, NULL, 138 err = v9fs_file_readn(filp, rdir->buf, NULL,
@@ -131,11 +143,11 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
131 rdir->head = 0; 143 rdir->head = 0;
132 rdir->tail = err; 144 rdir->tail = err;
133 } 145 }
134
135 while (rdir->head < rdir->tail) { 146 while (rdir->head < rdir->tail) {
147 p9stat_init(&st);
136 err = p9stat_read(rdir->buf + rdir->head, 148 err = p9stat_read(rdir->buf + rdir->head,
137 buflen - rdir->head, &st, 149 buflen - rdir->head, &st,
138 fid->clnt->dotu); 150 fid->clnt->proto_version);
139 if (err) { 151 if (err) {
140 P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); 152 P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
141 err = -EIO; 153 err = -EIO;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3902bf43a088..df52d488d2a6 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -61,7 +61,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
61 61
62 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file); 62 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file);
63 v9ses = v9fs_inode2v9ses(inode); 63 v9ses = v9fs_inode2v9ses(inode);
64 omode = v9fs_uflags2omode(file->f_flags, v9fs_extended(v9ses)); 64 omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses));
65 fid = file->private_data; 65 fid = file->private_data;
66 if (!fid) { 66 if (!fid) {
67 fid = v9fs_fid_clone(file->f_path.dentry); 67 fid = v9fs_fid_clone(file->f_path.dentry);
@@ -77,7 +77,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
77 i_size_write(inode, 0); 77 i_size_write(inode, 0);
78 inode->i_blocks = 0; 78 inode->i_blocks = 0;
79 } 79 }
80 if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses))) 80 if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses)))
81 generic_file_llseek(file, 0, SEEK_END); 81 generic_file_llseek(file, 0, SEEK_END);
82 } 82 }
83 83
@@ -114,7 +114,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
114 P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); 114 P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
115 115
116 /* No mandatory locks */ 116 /* No mandatory locks */
117 if (__mandatory_lock(inode)) 117 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
118 return -ENOLCK; 118 return -ENOLCK;
119 119
120 if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { 120 if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
@@ -215,7 +215,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
215 struct p9_fid *fid; 215 struct p9_fid *fid;
216 struct p9_client *clnt; 216 struct p9_client *clnt;
217 struct inode *inode = filp->f_path.dentry->d_inode; 217 struct inode *inode = filp->f_path.dentry->d_inode;
218 int origin = *offset; 218 loff_t origin = *offset;
219 unsigned long pg_start, pg_end; 219 unsigned long pg_start, pg_end;
220 220
221 P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, 221 P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
@@ -257,6 +257,23 @@ v9fs_file_write(struct file *filp, const char __user * data,
257 return total; 257 return total;
258} 258}
259 259
260static int v9fs_file_fsync(struct file *filp, struct dentry *dentry,
261 int datasync)
262{
263 struct p9_fid *fid;
264 struct p9_wstat wstat;
265 int retval;
266
267 P9_DPRINTK(P9_DEBUG_VFS, "filp %p dentry %p datasync %x\n", filp,
268 dentry, datasync);
269
270 fid = filp->private_data;
271 v9fs_blank_wstat(&wstat);
272
273 retval = p9_client_wstat(fid, &wstat);
274 return retval;
275}
276
260static const struct file_operations v9fs_cached_file_operations = { 277static const struct file_operations v9fs_cached_file_operations = {
261 .llseek = generic_file_llseek, 278 .llseek = generic_file_llseek,
262 .read = do_sync_read, 279 .read = do_sync_read,
@@ -266,6 +283,7 @@ static const struct file_operations v9fs_cached_file_operations = {
266 .release = v9fs_dir_release, 283 .release = v9fs_dir_release,
267 .lock = v9fs_file_lock, 284 .lock = v9fs_file_lock,
268 .mmap = generic_file_readonly_mmap, 285 .mmap = generic_file_readonly_mmap,
286 .fsync = v9fs_file_fsync,
269}; 287};
270 288
271const struct file_operations v9fs_file_operations = { 289const struct file_operations v9fs_file_operations = {
@@ -276,4 +294,5 @@ const struct file_operations v9fs_file_operations = {
276 .release = v9fs_dir_release, 294 .release = v9fs_dir_release,
277 .lock = v9fs_file_lock, 295 .lock = v9fs_file_lock,
278 .mmap = generic_file_readonly_mmap, 296 .mmap = generic_file_readonly_mmap,
297 .fsync = v9fs_file_fsync,
279}; 298};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 18f74ec4dce9..f2434fc9d2c4 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -34,6 +34,7 @@
34#include <linux/namei.h> 34#include <linux/namei.h>
35#include <linux/idr.h> 35#include <linux/idr.h>
36#include <linux/sched.h> 36#include <linux/sched.h>
37#include <linux/slab.h>
37#include <net/9p/9p.h> 38#include <net/9p/9p.h>
38#include <net/9p/client.h> 39#include <net/9p/client.h>
39 40
@@ -60,7 +61,7 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode)
60 res = mode & 0777; 61 res = mode & 0777;
61 if (S_ISDIR(mode)) 62 if (S_ISDIR(mode))
62 res |= P9_DMDIR; 63 res |= P9_DMDIR;
63 if (v9fs_extended(v9ses)) { 64 if (v9fs_proto_dotu(v9ses)) {
64 if (S_ISLNK(mode)) 65 if (S_ISLNK(mode))
65 res |= P9_DMSYMLINK; 66 res |= P9_DMSYMLINK;
66 if (v9ses->nodev == 0) { 67 if (v9ses->nodev == 0) {
@@ -102,21 +103,21 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
102 103
103 if ((mode & P9_DMDIR) == P9_DMDIR) 104 if ((mode & P9_DMDIR) == P9_DMDIR)
104 res |= S_IFDIR; 105 res |= S_IFDIR;
105 else if ((mode & P9_DMSYMLINK) && (v9fs_extended(v9ses))) 106 else if ((mode & P9_DMSYMLINK) && (v9fs_proto_dotu(v9ses)))
106 res |= S_IFLNK; 107 res |= S_IFLNK;
107 else if ((mode & P9_DMSOCKET) && (v9fs_extended(v9ses)) 108 else if ((mode & P9_DMSOCKET) && (v9fs_proto_dotu(v9ses))
108 && (v9ses->nodev == 0)) 109 && (v9ses->nodev == 0))
109 res |= S_IFSOCK; 110 res |= S_IFSOCK;
110 else if ((mode & P9_DMNAMEDPIPE) && (v9fs_extended(v9ses)) 111 else if ((mode & P9_DMNAMEDPIPE) && (v9fs_proto_dotu(v9ses))
111 && (v9ses->nodev == 0)) 112 && (v9ses->nodev == 0))
112 res |= S_IFIFO; 113 res |= S_IFIFO;
113 else if ((mode & P9_DMDEVICE) && (v9fs_extended(v9ses)) 114 else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses))
114 && (v9ses->nodev == 0)) 115 && (v9ses->nodev == 0))
115 res |= S_IFBLK; 116 res |= S_IFBLK;
116 else 117 else
117 res |= S_IFREG; 118 res |= S_IFREG;
118 119
119 if (v9fs_extended(v9ses)) { 120 if (v9fs_proto_dotu(v9ses)) {
120 if ((mode & P9_DMSETUID) == P9_DMSETUID) 121 if ((mode & P9_DMSETUID) == P9_DMSETUID)
121 res |= S_ISUID; 122 res |= S_ISUID;
122 123
@@ -176,7 +177,7 @@ int v9fs_uflags2omode(int uflags, int extended)
176 * 177 *
177 */ 178 */
178 179
179static void 180void
180v9fs_blank_wstat(struct p9_wstat *wstat) 181v9fs_blank_wstat(struct p9_wstat *wstat)
181{ 182{
182 wstat->type = ~0; 183 wstat->type = ~0;
@@ -265,7 +266,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
265 case S_IFBLK: 266 case S_IFBLK:
266 case S_IFCHR: 267 case S_IFCHR:
267 case S_IFSOCK: 268 case S_IFSOCK:
268 if (!v9fs_extended(v9ses)) { 269 if (!v9fs_proto_dotu(v9ses)) {
269 P9_DPRINTK(P9_DEBUG_ERROR, 270 P9_DPRINTK(P9_DEBUG_ERROR,
270 "special files without extended mode\n"); 271 "special files without extended mode\n");
271 err = -EINVAL; 272 err = -EINVAL;
@@ -278,7 +279,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
278 inode->i_fop = &v9fs_file_operations; 279 inode->i_fop = &v9fs_file_operations;
279 break; 280 break;
280 case S_IFLNK: 281 case S_IFLNK:
281 if (!v9fs_extended(v9ses)) { 282 if (!v9fs_proto_dotu(v9ses)) {
282 P9_DPRINTK(P9_DEBUG_ERROR, 283 P9_DPRINTK(P9_DEBUG_ERROR,
283 "extended modes used w/o 9P2000.u\n"); 284 "extended modes used w/o 9P2000.u\n");
284 err = -EINVAL; 285 err = -EINVAL;
@@ -288,7 +289,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
288 break; 289 break;
289 case S_IFDIR: 290 case S_IFDIR:
290 inc_nlink(inode); 291 inc_nlink(inode);
291 if (v9fs_extended(v9ses)) 292 if (v9fs_proto_dotu(v9ses))
292 inode->i_op = &v9fs_dir_inode_operations_ext; 293 inode->i_op = &v9fs_dir_inode_operations_ext;
293 else 294 else
294 inode->i_op = &v9fs_dir_inode_operations; 295 inode->i_op = &v9fs_dir_inode_operations;
@@ -431,6 +432,7 @@ error:
431 432
432static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) 433static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
433{ 434{
435 int retval;
434 struct inode *file_inode; 436 struct inode *file_inode;
435 struct v9fs_session_info *v9ses; 437 struct v9fs_session_info *v9ses;
436 struct p9_fid *v9fid; 438 struct p9_fid *v9fid;
@@ -444,7 +446,10 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
444 if (IS_ERR(v9fid)) 446 if (IS_ERR(v9fid))
445 return PTR_ERR(v9fid); 447 return PTR_ERR(v9fid);
446 448
447 return p9_client_remove(v9fid); 449 retval = p9_client_remove(v9fid);
450 if (!retval)
451 drop_nlink(file_inode);
452 return retval;
448} 453}
449 454
450static int 455static int
@@ -575,7 +580,8 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
575 flags = O_RDWR; 580 flags = O_RDWR;
576 581
577 fid = v9fs_create(v9ses, dir, dentry, NULL, perm, 582 fid = v9fs_create(v9ses, dir, dentry, NULL, perm,
578 v9fs_uflags2omode(flags, v9fs_extended(v9ses))); 583 v9fs_uflags2omode(flags,
584 v9fs_proto_dotu(v9ses)));
579 if (IS_ERR(fid)) { 585 if (IS_ERR(fid)) {
580 err = PTR_ERR(fid); 586 err = PTR_ERR(fid);
581 fid = NULL; 587 fid = NULL;
@@ -655,6 +661,9 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
655 P9_DPRINTK(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n", 661 P9_DPRINTK(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
656 dir, dentry->d_name.name, dentry, nameidata); 662 dir, dentry->d_name.name, dentry, nameidata);
657 663
664 if (dentry->d_name.len > NAME_MAX)
665 return ERR_PTR(-ENAMETOOLONG);
666
658 sb = dir->i_sb; 667 sb = dir->i_sb;
659 v9ses = v9fs_inode2v9ses(dir); 668 v9ses = v9fs_inode2v9ses(dir);
660 dfid = v9fs_fid_lookup(dentry->d_parent); 669 dfid = v9fs_fid_lookup(dentry->d_parent);
@@ -858,7 +867,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
858 if (iattr->ia_valid & ATTR_SIZE) 867 if (iattr->ia_valid & ATTR_SIZE)
859 wstat.length = iattr->ia_size; 868 wstat.length = iattr->ia_size;
860 869
861 if (v9fs_extended(v9ses)) { 870 if (v9fs_proto_dotu(v9ses)) {
862 if (iattr->ia_valid & ATTR_UID) 871 if (iattr->ia_valid & ATTR_UID)
863 wstat.n_uid = iattr->ia_uid; 872 wstat.n_uid = iattr->ia_uid;
864 873
@@ -886,6 +895,8 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
886 struct super_block *sb) 895 struct super_block *sb)
887{ 896{
888 char ext[32]; 897 char ext[32];
898 char tag_name[14];
899 unsigned int i_nlink;
889 struct v9fs_session_info *v9ses = sb->s_fs_info; 900 struct v9fs_session_info *v9ses = sb->s_fs_info;
890 901
891 inode->i_nlink = 1; 902 inode->i_nlink = 1;
@@ -897,11 +908,26 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
897 inode->i_uid = v9ses->dfltuid; 908 inode->i_uid = v9ses->dfltuid;
898 inode->i_gid = v9ses->dfltgid; 909 inode->i_gid = v9ses->dfltgid;
899 910
900 if (v9fs_extended(v9ses)) { 911 if (v9fs_proto_dotu(v9ses)) {
901 inode->i_uid = stat->n_uid; 912 inode->i_uid = stat->n_uid;
902 inode->i_gid = stat->n_gid; 913 inode->i_gid = stat->n_gid;
903 } 914 }
904 915 if ((S_ISREG(inode->i_mode)) || (S_ISDIR(inode->i_mode))) {
916 if (v9fs_proto_dotu(v9ses) && (stat->extension[0] != '\0')) {
917 /*
918 * Hadlink support got added later to
919 * to the .u extension. So there can be
920 * server out there that doesn't support
921 * this even with .u extension. So check
922 * for non NULL stat->extension
923 */
924 strncpy(ext, stat->extension, sizeof(ext));
925 /* HARDLINKCOUNT %u */
926 sscanf(ext, "%13s %u", tag_name, &i_nlink);
927 if (!strncmp(tag_name, "HARDLINKCOUNT", 13))
928 inode->i_nlink = i_nlink;
929 }
930 }
905 inode->i_mode = p9mode2unixmode(v9ses, stat->mode); 931 inode->i_mode = p9mode2unixmode(v9ses, stat->mode);
906 if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) { 932 if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) {
907 char type = 0; 933 char type = 0;
@@ -976,7 +1002,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
976 if (IS_ERR(fid)) 1002 if (IS_ERR(fid))
977 return PTR_ERR(fid); 1003 return PTR_ERR(fid);
978 1004
979 if (!v9fs_extended(v9ses)) 1005 if (!v9fs_proto_dotu(v9ses))
980 return -EBADF; 1006 return -EBADF;
981 1007
982 st = p9_client_stat(fid); 1008 st = p9_client_stat(fid);
@@ -1001,44 +1027,6 @@ done:
1001} 1027}
1002 1028
1003/** 1029/**
1004 * v9fs_vfs_readlink - read a symlink's location
1005 * @dentry: dentry for symlink
1006 * @buffer: buffer to load symlink location into
1007 * @buflen: length of buffer
1008 *
1009 */
1010
1011static int v9fs_vfs_readlink(struct dentry *dentry, char __user * buffer,
1012 int buflen)
1013{
1014 int retval;
1015 int ret;
1016 char *link = __getname();
1017
1018 if (unlikely(!link))
1019 return -ENOMEM;
1020
1021 if (buflen > PATH_MAX)
1022 buflen = PATH_MAX;
1023
1024 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
1025 dentry);
1026
1027 retval = v9fs_readlink(dentry, link, buflen);
1028
1029 if (retval > 0) {
1030 if ((ret = copy_to_user(buffer, link, retval)) != 0) {
1031 P9_DPRINTK(P9_DEBUG_ERROR,
1032 "problem copying to user: %d\n", ret);
1033 retval = ret;
1034 }
1035 }
1036
1037 __putname(link);
1038 return retval;
1039}
1040
1041/**
1042 * v9fs_vfs_follow_link - follow a symlink path 1030 * v9fs_vfs_follow_link - follow a symlink path
1043 * @dentry: dentry for symlink 1031 * @dentry: dentry for symlink
1044 * @nd: nameidata 1032 * @nd: nameidata
@@ -1104,7 +1092,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
1104 struct p9_fid *fid; 1092 struct p9_fid *fid;
1105 1093
1106 v9ses = v9fs_inode2v9ses(dir); 1094 v9ses = v9fs_inode2v9ses(dir);
1107 if (!v9fs_extended(v9ses)) { 1095 if (!v9fs_proto_dotu(v9ses)) {
1108 P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n"); 1096 P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n");
1109 return -EPERM; 1097 return -EPERM;
1110 } 1098 }
@@ -1230,7 +1218,6 @@ static const struct inode_operations v9fs_dir_inode_operations_ext = {
1230 .rmdir = v9fs_vfs_rmdir, 1218 .rmdir = v9fs_vfs_rmdir,
1231 .mknod = v9fs_vfs_mknod, 1219 .mknod = v9fs_vfs_mknod,
1232 .rename = v9fs_vfs_rename, 1220 .rename = v9fs_vfs_rename,
1233 .readlink = v9fs_vfs_readlink,
1234 .getattr = v9fs_vfs_getattr, 1221 .getattr = v9fs_vfs_getattr,
1235 .setattr = v9fs_vfs_setattr, 1222 .setattr = v9fs_vfs_setattr,
1236}; 1223};
@@ -1253,7 +1240,7 @@ static const struct inode_operations v9fs_file_inode_operations = {
1253}; 1240};
1254 1241
1255static const struct inode_operations v9fs_symlink_inode_operations = { 1242static const struct inode_operations v9fs_symlink_inode_operations = {
1256 .readlink = v9fs_vfs_readlink, 1243 .readlink = generic_readlink,
1257 .follow_link = v9fs_vfs_follow_link, 1244 .follow_link = v9fs_vfs_follow_link,
1258 .put_link = v9fs_vfs_put_link, 1245 .put_link = v9fs_vfs_put_link,
1259 .getattr = v9fs_vfs_getattr, 1246 .getattr = v9fs_vfs_getattr,
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 14a86448572c..806da5d3b3a0 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -37,6 +37,7 @@
37#include <linux/mount.h> 37#include <linux/mount.h>
38#include <linux/idr.h> 38#include <linux/idr.h>
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <linux/slab.h>
40#include <net/9p/9p.h> 41#include <net/9p/9p.h>
41#include <net/9p/client.h> 42#include <net/9p/client.h>
42 43
@@ -76,6 +77,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
76 sb->s_blocksize = 1 << sb->s_blocksize_bits; 77 sb->s_blocksize = 1 << sb->s_blocksize_bits;
77 sb->s_magic = V9FS_MAGIC; 78 sb->s_magic = V9FS_MAGIC;
78 sb->s_op = &v9fs_super_ops; 79 sb->s_op = &v9fs_super_ops;
80 sb->s_bdi = &v9ses->bdi;
79 81
80 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | 82 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
81 MS_NOATIME; 83 MS_NOATIME;
@@ -188,10 +190,12 @@ static void v9fs_kill_super(struct super_block *s)
188 190
189 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); 191 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s);
190 192
191 v9fs_dentry_release(s->s_root); /* clunk root */ 193 if (s->s_root)
194 v9fs_dentry_release(s->s_root); /* clunk root */
192 195
193 kill_anon_super(s); 196 kill_anon_super(s);
194 197
198 v9fs_session_cancel(v9ses);
195 v9fs_session_close(v9ses); 199 v9fs_session_close(v9ses);
196 kfree(v9ses); 200 kfree(v9ses);
197 s->s_fs_info = NULL; 201 s->s_fs_info = NULL;
@@ -204,7 +208,7 @@ v9fs_umount_begin(struct super_block *sb)
204 struct v9fs_session_info *v9ses; 208 struct v9fs_session_info *v9ses;
205 209
206 v9ses = sb->s_fs_info; 210 v9ses = sb->s_fs_info;
207 v9fs_session_cancel(v9ses); 211 v9fs_session_begin_cancel(v9ses);
208} 212}
209 213
210static const struct super_operations v9fs_super_ops = { 214static const struct super_operations v9fs_super_ops = {
diff --git a/fs/Kconfig b/fs/Kconfig
index 64d44efad7a5..5f85b5947613 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -177,6 +177,7 @@ source "fs/efs/Kconfig"
177source "fs/jffs2/Kconfig" 177source "fs/jffs2/Kconfig"
178# UBIFS File system configuration 178# UBIFS File system configuration
179source "fs/ubifs/Kconfig" 179source "fs/ubifs/Kconfig"
180source "fs/logfs/Kconfig"
180source "fs/cramfs/Kconfig" 181source "fs/cramfs/Kconfig"
181source "fs/squashfs/Kconfig" 182source "fs/squashfs/Kconfig"
182source "fs/freevxfs/Kconfig" 183source "fs/freevxfs/Kconfig"
@@ -234,6 +235,7 @@ config NFS_COMMON
234 235
235source "net/sunrpc/Kconfig" 236source "net/sunrpc/Kconfig"
236source "fs/smbfs/Kconfig" 237source "fs/smbfs/Kconfig"
238source "fs/ceph/Kconfig"
237source "fs/cifs/Kconfig" 239source "fs/cifs/Kconfig"
238source "fs/ncpfs/Kconfig" 240source "fs/ncpfs/Kconfig"
239source "fs/coda/Kconfig" 241source "fs/coda/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index af6d04700d9c..97f340f14ba2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -99,6 +99,7 @@ obj-$(CONFIG_NTFS_FS) += ntfs/
99obj-$(CONFIG_UFS_FS) += ufs/ 99obj-$(CONFIG_UFS_FS) += ufs/
100obj-$(CONFIG_EFS_FS) += efs/ 100obj-$(CONFIG_EFS_FS) += efs/
101obj-$(CONFIG_JFFS2_FS) += jffs2/ 101obj-$(CONFIG_JFFS2_FS) += jffs2/
102obj-$(CONFIG_LOGFS) += logfs/
102obj-$(CONFIG_UBIFS_FS) += ubifs/ 103obj-$(CONFIG_UBIFS_FS) += ubifs/
103obj-$(CONFIG_AFFS_FS) += affs/ 104obj-$(CONFIG_AFFS_FS) += affs/
104obj-$(CONFIG_ROMFS_FS) += romfs/ 105obj-$(CONFIG_ROMFS_FS) += romfs/
@@ -124,3 +125,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
124obj-$(CONFIG_BTRFS_FS) += btrfs/ 125obj-$(CONFIG_BTRFS_FS) += btrfs/
125obj-$(CONFIG_GFS2_FS) += gfs2/ 126obj-$(CONFIG_GFS2_FS) += gfs2/
126obj-$(CONFIG_EXOFS_FS) += exofs/ 127obj-$(CONFIG_EXOFS_FS) += exofs/
128obj-$(CONFIG_CEPH_FS) += ceph/
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 9cc18775b832..2ff622f6f547 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -121,7 +121,7 @@ struct adfs_discmap {
121 121
122/* Inode stuff */ 122/* Inode stuff */
123struct inode *adfs_iget(struct super_block *sb, struct object_info *obj); 123struct inode *adfs_iget(struct super_block *sb, struct object_info *obj);
124int adfs_write_inode(struct inode *inode,int unused); 124int adfs_write_inode(struct inode *inode, struct writeback_control *wbc);
125int adfs_notify_change(struct dentry *dentry, struct iattr *attr); 125int adfs_notify_change(struct dentry *dentry, struct iattr *attr);
126 126
127/* map.c */ 127/* map.c */
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 3f57ce4bee5d..0f5e30978135 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -9,6 +9,7 @@
9 */ 9 */
10#include <linux/smp_lock.h> 10#include <linux/smp_lock.h>
11#include <linux/buffer_head.h> 11#include <linux/buffer_head.h>
12#include <linux/writeback.h>
12#include "adfs.h" 13#include "adfs.h"
13 14
14/* 15/*
@@ -360,7 +361,7 @@ out:
360 * The adfs-specific inode data has already been updated by 361 * The adfs-specific inode data has already been updated by
361 * adfs_notify_change() 362 * adfs_notify_change()
362 */ 363 */
363int adfs_write_inode(struct inode *inode, int wait) 364int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
364{ 365{
365 struct super_block *sb = inode->i_sb; 366 struct super_block *sb = inode->i_sb;
366 struct object_info obj; 367 struct object_info obj;
@@ -375,7 +376,7 @@ int adfs_write_inode(struct inode *inode, int wait)
375 obj.attr = ADFS_I(inode)->attr; 376 obj.attr = ADFS_I(inode)->attr;
376 obj.size = inode->i_size; 377 obj.size = inode->i_size;
377 378
378 ret = adfs_dir_update(sb, &obj, wait); 379 ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL);
379 unlock_kernel(); 380 unlock_kernel();
380 return ret; 381 return ret;
381} 382}
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 6910a98bd73c..4a3af7075c1d 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -13,6 +13,7 @@
13#include <linux/parser.h> 13#include <linux/parser.h>
14#include <linux/mount.h> 14#include <linux/mount.h>
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/slab.h>
16#include <linux/smp_lock.h> 17#include <linux/smp_lock.h>
17#include <linux/statfs.h> 18#include <linux/statfs.h>
18#include "adfs.h" 19#include "adfs.h"
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index e511dc621a2e..861dae68ac12 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -106,8 +106,8 @@ struct affs_sb_info {
106 u32 s_last_bmap; 106 u32 s_last_bmap;
107 struct buffer_head *s_bmap_bh; 107 struct buffer_head *s_bmap_bh;
108 char *s_prefix; /* Prefix for volumes and assigns. */ 108 char *s_prefix; /* Prefix for volumes and assigns. */
109 int s_prefix_len; /* Length of prefix. */
110 char s_volume[32]; /* Volume prefix for absolute symlinks. */ 109 char s_volume[32]; /* Volume prefix for absolute symlinks. */
110 spinlock_t symlink_lock; /* protects the previous two */
111}; 111};
112 112
113#define SF_INTL 0x0001 /* International filesystem. */ 113#define SF_INTL 0x0001 /* International filesystem. */
@@ -175,7 +175,8 @@ extern void affs_delete_inode(struct inode *inode);
175extern void affs_clear_inode(struct inode *inode); 175extern void affs_clear_inode(struct inode *inode);
176extern struct inode *affs_iget(struct super_block *sb, 176extern struct inode *affs_iget(struct super_block *sb,
177 unsigned long ino); 177 unsigned long ino);
178extern int affs_write_inode(struct inode *inode, int); 178extern int affs_write_inode(struct inode *inode,
179 struct writeback_control *wbc);
179extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type); 180extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type);
180 181
181/* file.c */ 182/* file.c */
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index dc5ef14bdc1c..3e262711ae06 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -7,6 +7,7 @@
7 * block allocation, deallocation, calculation of free space. 7 * block allocation, deallocation, calculation of free space.
8 */ 8 */
9 9
10#include <linux/slab.h>
10#include "affs.h" 11#include "affs.h"
11 12
12/* This is, of course, shamelessly stolen from fs/minix */ 13/* This is, of course, shamelessly stolen from fs/minix */
@@ -128,7 +129,7 @@ err_range:
128/* 129/*
129 * Allocate a block in the given allocation zone. 130 * Allocate a block in the given allocation zone.
130 * Since we have to byte-swap the bitmap on little-endian 131 * Since we have to byte-swap the bitmap on little-endian
131 * machines, this is rather expensive. Therefor we will 132 * machines, this is rather expensive. Therefore we will
132 * preallocate up to 16 blocks from the same word, if 133 * preallocate up to 16 blocks from the same word, if
133 * possible. We are not doing preallocations in the 134 * possible. We are not doing preallocations in the
134 * header zone, though. 135 * header zone, though.
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 3c4ec7d864c4..f4b2a4ee4f91 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -10,6 +10,7 @@
10 * (C) 1991 Linus Torvalds - minix filesystem 10 * (C) 1991 Linus Torvalds - minix filesystem
11 */ 11 */
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/gfp.h>
13#include "affs.h" 14#include "affs.h"
14 15
15extern const struct inode_operations affs_symlink_inode_operations; 16extern const struct inode_operations affs_symlink_inode_operations;
@@ -166,7 +167,7 @@ bad_inode:
166} 167}
167 168
168int 169int
169affs_write_inode(struct inode *inode, int unused) 170affs_write_inode(struct inode *inode, struct writeback_control *wbc)
170{ 171{
171 struct super_block *sb = inode->i_sb; 172 struct super_block *sb = inode->i_sb;
172 struct buffer_head *bh; 173 struct buffer_head *bh;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 960d336ec694..d70bbbac6b7b 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -341,10 +341,13 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
341 p = (char *)AFFS_HEAD(bh)->table; 341 p = (char *)AFFS_HEAD(bh)->table;
342 lc = '/'; 342 lc = '/';
343 if (*symname == '/') { 343 if (*symname == '/') {
344 struct affs_sb_info *sbi = AFFS_SB(sb);
344 while (*symname == '/') 345 while (*symname == '/')
345 symname++; 346 symname++;
346 while (AFFS_SB(sb)->s_volume[i]) /* Cannot overflow */ 347 spin_lock(&sbi->symlink_lock);
347 *p++ = AFFS_SB(sb)->s_volume[i++]; 348 while (sbi->s_volume[i]) /* Cannot overflow */
349 *p++ = sbi->s_volume[i++];
350 spin_unlock(&sbi->symlink_lock);
348 } 351 }
349 while (i < maxlen && (c = *symname++)) { 352 while (i < maxlen && (c = *symname++)) {
350 if (c == '.' && lc == '/' && *symname == '.' && symname[1] == '/') { 353 if (c == '.' && lc == '/' && *symname == '.' && symname[1] == '/') {
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 104fdcb3a7fc..16a3e4765f68 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -17,6 +17,7 @@
17#include <linux/magic.h> 17#include <linux/magic.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/smp_lock.h> 19#include <linux/smp_lock.h>
20#include <linux/slab.h>
20#include "affs.h" 21#include "affs.h"
21 22
22extern struct timezone sys_tz; 23extern struct timezone sys_tz;
@@ -203,7 +204,7 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s
203 switch (token) { 204 switch (token) {
204 case Opt_bs: 205 case Opt_bs:
205 if (match_int(&args[0], &n)) 206 if (match_int(&args[0], &n))
206 return -EINVAL; 207 return 0;
207 if (n != 512 && n != 1024 && n != 2048 208 if (n != 512 && n != 1024 && n != 2048
208 && n != 4096) { 209 && n != 4096) {
209 printk ("AFFS: Invalid blocksize (512, 1024, 2048, 4096 allowed)\n"); 210 printk ("AFFS: Invalid blocksize (512, 1024, 2048, 4096 allowed)\n");
@@ -213,7 +214,7 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s
213 break; 214 break;
214 case Opt_mode: 215 case Opt_mode:
215 if (match_octal(&args[0], &option)) 216 if (match_octal(&args[0], &option))
216 return 1; 217 return 0;
217 *mode = option & 0777; 218 *mode = option & 0777;
218 *mount_opts |= SF_SETMODE; 219 *mount_opts |= SF_SETMODE;
219 break; 220 break;
@@ -221,8 +222,6 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s
221 *mount_opts |= SF_MUFS; 222 *mount_opts |= SF_MUFS;
222 break; 223 break;
223 case Opt_prefix: 224 case Opt_prefix:
224 /* Free any previous prefix */
225 kfree(*prefix);
226 *prefix = match_strdup(&args[0]); 225 *prefix = match_strdup(&args[0]);
227 if (!*prefix) 226 if (!*prefix)
228 return 0; 227 return 0;
@@ -233,21 +232,21 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s
233 break; 232 break;
234 case Opt_reserved: 233 case Opt_reserved:
235 if (match_int(&args[0], reserved)) 234 if (match_int(&args[0], reserved))
236 return 1; 235 return 0;
237 break; 236 break;
238 case Opt_root: 237 case Opt_root:
239 if (match_int(&args[0], root)) 238 if (match_int(&args[0], root))
240 return 1; 239 return 0;
241 break; 240 break;
242 case Opt_setgid: 241 case Opt_setgid:
243 if (match_int(&args[0], &option)) 242 if (match_int(&args[0], &option))
244 return 1; 243 return 0;
245 *gid = option; 244 *gid = option;
246 *mount_opts |= SF_SETGID; 245 *mount_opts |= SF_SETGID;
247 break; 246 break;
248 case Opt_setuid: 247 case Opt_setuid:
249 if (match_int(&args[0], &option)) 248 if (match_int(&args[0], &option))
250 return -EINVAL; 249 return 0;
251 *uid = option; 250 *uid = option;
252 *mount_opts |= SF_SETUID; 251 *mount_opts |= SF_SETUID;
253 break; 252 break;
@@ -311,11 +310,14 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
311 return -ENOMEM; 310 return -ENOMEM;
312 sb->s_fs_info = sbi; 311 sb->s_fs_info = sbi;
313 mutex_init(&sbi->s_bmlock); 312 mutex_init(&sbi->s_bmlock);
313 spin_lock_init(&sbi->symlink_lock);
314 314
315 if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block, 315 if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block,
316 &blocksize,&sbi->s_prefix, 316 &blocksize,&sbi->s_prefix,
317 sbi->s_volume, &mount_flags)) { 317 sbi->s_volume, &mount_flags)) {
318 printk(KERN_ERR "AFFS: Error parsing options\n"); 318 printk(KERN_ERR "AFFS: Error parsing options\n");
319 kfree(sbi->s_prefix);
320 kfree(sbi);
319 return -EINVAL; 321 return -EINVAL;
320 } 322 }
321 /* N.B. after this point s_prefix must be released */ 323 /* N.B. after this point s_prefix must be released */
@@ -516,14 +518,18 @@ affs_remount(struct super_block *sb, int *flags, char *data)
516 unsigned long mount_flags; 518 unsigned long mount_flags;
517 int res = 0; 519 int res = 0;
518 char *new_opts = kstrdup(data, GFP_KERNEL); 520 char *new_opts = kstrdup(data, GFP_KERNEL);
521 char volume[32];
522 char *prefix = NULL;
519 523
520 pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data); 524 pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data);
521 525
522 *flags |= MS_NODIRATIME; 526 *flags |= MS_NODIRATIME;
523 527
528 memcpy(volume, sbi->s_volume, 32);
524 if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block, 529 if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block,
525 &blocksize, &sbi->s_prefix, sbi->s_volume, 530 &blocksize, &prefix, volume,
526 &mount_flags)) { 531 &mount_flags)) {
532 kfree(prefix);
527 kfree(new_opts); 533 kfree(new_opts);
528 return -EINVAL; 534 return -EINVAL;
529 } 535 }
@@ -534,6 +540,14 @@ affs_remount(struct super_block *sb, int *flags, char *data)
534 sbi->s_mode = mode; 540 sbi->s_mode = mode;
535 sbi->s_uid = uid; 541 sbi->s_uid = uid;
536 sbi->s_gid = gid; 542 sbi->s_gid = gid;
543 /* protect against readers */
544 spin_lock(&sbi->symlink_lock);
545 if (prefix) {
546 kfree(sbi->s_prefix);
547 sbi->s_prefix = prefix;
548 }
549 memcpy(sbi->s_volume, volume, 32);
550 spin_unlock(&sbi->symlink_lock);
537 551
538 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 552 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
539 unlock_kernel(); 553 unlock_kernel();
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index 41782539c907..ee00f08c4f53 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -20,7 +20,6 @@ static int affs_symlink_readpage(struct file *file, struct page *page)
20 int i, j; 20 int i, j;
21 char c; 21 char c;
22 char lc; 22 char lc;
23 char *pf;
24 23
25 pr_debug("AFFS: follow_link(ino=%lu)\n",inode->i_ino); 24 pr_debug("AFFS: follow_link(ino=%lu)\n",inode->i_ino);
26 25
@@ -32,11 +31,15 @@ static int affs_symlink_readpage(struct file *file, struct page *page)
32 j = 0; 31 j = 0;
33 lf = (struct slink_front *)bh->b_data; 32 lf = (struct slink_front *)bh->b_data;
34 lc = 0; 33 lc = 0;
35 pf = AFFS_SB(inode->i_sb)->s_prefix ? AFFS_SB(inode->i_sb)->s_prefix : "/";
36 34
37 if (strchr(lf->symname,':')) { /* Handle assign or volume name */ 35 if (strchr(lf->symname,':')) { /* Handle assign or volume name */
36 struct affs_sb_info *sbi = AFFS_SB(inode->i_sb);
37 char *pf;
38 spin_lock(&sbi->symlink_lock);
39 pf = sbi->s_prefix ? sbi->s_prefix : "/";
38 while (i < 1023 && (c = pf[i])) 40 while (i < 1023 && (c = pf[i]))
39 link[i++] = c; 41 link[i++] = c;
42 spin_unlock(&sbi->symlink_lock);
40 while (i < 1023 && lf->symname[j] != ':') 43 while (i < 1023 && lf->symname[j] != ':')
41 link[i++] = lf->symname[j++]; 44 link[i++] = lf->symname[j++];
42 if (i < 1023) 45 if (i < 1023)
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index e2b1d3f16519..0fb315dd4d2a 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -9,7 +9,6 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/slab.h>
13#include <linux/sched.h> 12#include <linux/sched.h>
14#include "internal.h" 13#include "internal.h"
15 14
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index eb765489164f..a3bcec75c54a 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/slab.h>
14#include <linux/sched.h> 15#include <linux/sched.h>
15#include <linux/ip.h> 16#include <linux/ip.h>
16#include "internal.h" 17#include "internal.h"
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 88067f36e5e7..adc1cb771b57 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -12,7 +12,6 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/slab.h>
16#include <linux/fs.h> 15#include <linux/fs.h>
17#include <linux/pagemap.h> 16#include <linux/pagemap.h>
18#include <linux/ctype.h> 17#include <linux/ctype.h>
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 39b301662f22..0df9bc2b724d 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -12,10 +12,10 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/slab.h>
16#include <linux/fs.h> 15#include <linux/fs.h>
17#include <linux/pagemap.h> 16#include <linux/pagemap.h>
18#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/gfp.h>
19#include "internal.h" 19#include "internal.h"
20 20
21static int afs_readpage(struct file *file, struct page *page); 21static int afs_readpage(struct file *file, struct page *page);
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 023b95b0d9d7..4bd0218473a9 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -10,6 +10,7 @@
10 */ 10 */
11 11
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/slab.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include <linux/circ_buf.h> 15#include <linux/circ_buf.h>
15#include "internal.h" 16#include "internal.h"
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index c048f0658751..d00b312e3110 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -16,7 +16,6 @@
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/slab.h>
20#include <linux/fs.h> 19#include <linux/fs.h>
21#include <linux/pagemap.h> 20#include <linux/pagemap.h>
22#include <linux/sched.h> 21#include <linux/sched.h>
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 6ece2a13bf71..a10f2582844f 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -19,6 +19,7 @@
19#include <linux/workqueue.h> 19#include <linux/workqueue.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/fscache.h> 21#include <linux/fscache.h>
22#include <linux/backing-dev.h>
22 23
23#include "afs.h" 24#include "afs.h"
24#include "afs_vl.h" 25#include "afs_vl.h"
@@ -313,6 +314,7 @@ struct afs_volume {
313 unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */ 314 unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */
314 struct afs_server *servers[8]; /* servers on which volume resides (ordered) */ 315 struct afs_server *servers[8]; /* servers on which volume resides (ordered) */
315 struct rw_semaphore server_sem; /* lock for accessing current server */ 316 struct rw_semaphore server_sem; /* lock for accessing current server */
317 struct backing_dev_info bdi;
316}; 318};
317 319
318/* 320/*
@@ -733,7 +735,6 @@ extern int afs_write_end(struct file *file, struct address_space *mapping,
733 struct page *page, void *fsdata); 735 struct page *page, void *fsdata);
734extern int afs_writepage(struct page *, struct writeback_control *); 736extern int afs_writepage(struct page *, struct writeback_control *);
735extern int afs_writepages(struct address_space *, struct writeback_control *); 737extern int afs_writepages(struct address_space *, struct writeback_control *);
736extern int afs_write_inode(struct inode *, int);
737extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *); 738extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
738extern ssize_t afs_file_write(struct kiocb *, const struct iovec *, 739extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
739 unsigned long, loff_t); 740 unsigned long, loff_t);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 5ffb570cd3a8..b3feddc4f7d6 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -12,11 +12,11 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/slab.h>
16#include <linux/fs.h> 15#include <linux/fs.h>
17#include <linux/pagemap.h> 16#include <linux/pagemap.h>
18#include <linux/mount.h> 17#include <linux/mount.h>
19#include <linux/namei.h> 18#include <linux/namei.h>
19#include <linux/gfp.h>
20#include "internal.h" 20#include "internal.h"
21 21
22 22
@@ -138,9 +138,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
138{ 138{
139 struct afs_super_info *super; 139 struct afs_super_info *super;
140 struct vfsmount *mnt; 140 struct vfsmount *mnt;
141 struct page *page = NULL; 141 struct page *page;
142 size_t size; 142 size_t size;
143 char *buf, *devname = NULL, *options = NULL; 143 char *buf, *devname, *options;
144 int ret; 144 int ret;
145 145
146 _enter("{%s}", mntpt->d_name.name); 146 _enter("{%s}", mntpt->d_name.name);
@@ -150,22 +150,22 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
150 ret = -EINVAL; 150 ret = -EINVAL;
151 size = mntpt->d_inode->i_size; 151 size = mntpt->d_inode->i_size;
152 if (size > PAGE_SIZE - 1) 152 if (size > PAGE_SIZE - 1)
153 goto error; 153 goto error_no_devname;
154 154
155 ret = -ENOMEM; 155 ret = -ENOMEM;
156 devname = (char *) get_zeroed_page(GFP_KERNEL); 156 devname = (char *) get_zeroed_page(GFP_KERNEL);
157 if (!devname) 157 if (!devname)
158 goto error; 158 goto error_no_devname;
159 159
160 options = (char *) get_zeroed_page(GFP_KERNEL); 160 options = (char *) get_zeroed_page(GFP_KERNEL);
161 if (!options) 161 if (!options)
162 goto error; 162 goto error_no_options;
163 163
164 /* read the contents of the AFS special symlink */ 164 /* read the contents of the AFS special symlink */
165 page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL); 165 page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
166 if (IS_ERR(page)) { 166 if (IS_ERR(page)) {
167 ret = PTR_ERR(page); 167 ret = PTR_ERR(page);
168 goto error; 168 goto error_no_page;
169 } 169 }
170 170
171 ret = -EIO; 171 ret = -EIO;
@@ -196,12 +196,12 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
196 return mnt; 196 return mnt;
197 197
198error: 198error:
199 if (page) 199 page_cache_release(page);
200 page_cache_release(page); 200error_no_page:
201 if (devname) 201 free_page((unsigned long) options);
202 free_page((unsigned long) devname); 202error_no_options:
203 if (options) 203 free_page((unsigned long) devname);
204 free_page((unsigned long) options); 204error_no_devname:
205 _leave(" = %d", ret); 205 _leave(" = %d", ret);
206 return ERR_PTR(ret); 206 return ERR_PTR(ret);
207} 207}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index bde3f19c0995..67cf810e0fd6 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -9,6 +9,7 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/slab.h>
12#include <net/sock.h> 13#include <net/sock.h>
13#include <net/af_rxrpc.h> 14#include <net/af_rxrpc.h>
14#include <rxrpc/packet.h> 15#include <rxrpc/packet.h>
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 3ef504370034..bb4ed144d0e4 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -189,8 +189,9 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
189 if (!permits) 189 if (!permits)
190 goto out_unlock; 190 goto out_unlock;
191 191
192 memcpy(permits->permits, xpermits->permits, 192 if (xpermits)
193 count * sizeof(struct afs_permit)); 193 memcpy(permits->permits, xpermits->permits,
194 count * sizeof(struct afs_permit));
194 195
195 _debug("key %x access %x", 196 _debug("key %x access %x",
196 key_serial(key), vnode->status.caller_access); 197 key_serial(key), vnode->status.caller_access);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index e1ea1c240b6a..e932e5a3a0c1 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -48,7 +48,6 @@ struct file_system_type afs_fs_type = {
48static const struct super_operations afs_super_ops = { 48static const struct super_operations afs_super_ops = {
49 .statfs = afs_statfs, 49 .statfs = afs_statfs,
50 .alloc_inode = afs_alloc_inode, 50 .alloc_inode = afs_alloc_inode,
51 .write_inode = afs_write_inode,
52 .destroy_inode = afs_destroy_inode, 51 .destroy_inode = afs_destroy_inode,
53 .clear_inode = afs_clear_inode, 52 .clear_inode = afs_clear_inode,
54 .put_super = afs_put_super, 53 .put_super = afs_put_super,
@@ -312,6 +311,7 @@ static int afs_fill_super(struct super_block *sb, void *data)
312 sb->s_magic = AFS_FS_MAGIC; 311 sb->s_magic = AFS_FS_MAGIC;
313 sb->s_op = &afs_super_ops; 312 sb->s_op = &afs_super_ops;
314 sb->s_fs_info = as; 313 sb->s_fs_info = as;
314 sb->s_bdi = &as->volume->bdi;
315 315
316 /* allocate the root inode and dentry */ 316 /* allocate the root inode and dentry */
317 fid.vid = as->volume->vid; 317 fid.vid = as->volume->vid;
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 36c1306e09e0..340afd0cd182 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -9,6 +9,7 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/gfp.h>
12#include <linux/init.h> 13#include <linux/init.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include "internal.h" 15#include "internal.h"
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 6e689208def2..9ac260d1361d 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/slab.h>
14#include <linux/init.h> 15#include <linux/init.h>
15#include <linux/sched.h> 16#include <linux/sched.h>
16#include "internal.h" 17#include "internal.h"
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 2f05c4fc2a70..25cf4c3f4ff7 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -12,7 +12,6 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/slab.h>
16#include <linux/fs.h> 15#include <linux/fs.h>
17#include <linux/sched.h> 16#include <linux/sched.h>
18#include "internal.h" 17#include "internal.h"
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index a353e69e2391..401eeb21869f 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -106,6 +106,10 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
106 volume->cell = params->cell; 106 volume->cell = params->cell;
107 volume->vid = vlocation->vldb.vid[params->type]; 107 volume->vid = vlocation->vldb.vid[params->type];
108 108
109 ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY);
110 if (ret)
111 goto error_bdi;
112
109 init_rwsem(&volume->server_sem); 113 init_rwsem(&volume->server_sem);
110 114
111 /* look up all the applicable server records */ 115 /* look up all the applicable server records */
@@ -151,6 +155,8 @@ error:
151 return ERR_PTR(ret); 155 return ERR_PTR(ret);
152 156
153error_discard: 157error_discard:
158 bdi_destroy(&volume->bdi);
159error_bdi:
154 up_write(&params->cell->vl_sem); 160 up_write(&params->cell->vl_sem);
155 161
156 for (loop = volume->nservers - 1; loop >= 0; loop--) 162 for (loop = volume->nservers - 1; loop >= 0; loop--)
@@ -200,6 +206,7 @@ void afs_put_volume(struct afs_volume *volume)
200 for (loop = volume->nservers - 1; loop >= 0; loop--) 206 for (loop = volume->nservers - 1; loop >= 0; loop--)
201 afs_put_server(volume->servers[loop]); 207 afs_put_server(volume->servers[loop]);
202 208
209 bdi_destroy(&volume->bdi);
203 kfree(volume); 210 kfree(volume);
204 211
205 _leave(" [destroyed]"); 212 _leave(" [destroyed]");
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 5e15a21dbf9f..3bed54a294d4 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -585,27 +585,6 @@ int afs_writepages(struct address_space *mapping,
585} 585}
586 586
587/* 587/*
588 * write an inode back
589 */
590int afs_write_inode(struct inode *inode, int sync)
591{
592 struct afs_vnode *vnode = AFS_FS_I(inode);
593 int ret;
594
595 _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
596
597 ret = 0;
598 if (sync) {
599 ret = filemap_fdatawait(inode->i_mapping);
600 if (ret < 0)
601 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
602 }
603
604 _leave(" = %d", ret);
605 return ret;
606}
607
608/*
609 * completion of write to server 588 * completion of write to server
610 */ 589 */
611void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) 590void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 9f0bf13291e5..e4b75d6eda83 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -12,7 +12,6 @@
12#include <linux/file.h> 12#include <linux/file.h>
13#include <linux/poll.h> 13#include <linux/poll.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/slab.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/fs.h> 16#include <linux/fs.h>
18#include <linux/mount.h> 17#include <linux/mount.h>
@@ -209,6 +208,7 @@ static struct inode *anon_inode_mkinode(void)
209 inode->i_mode = S_IRUSR | S_IWUSR; 208 inode->i_mode = S_IRUSR | S_IWUSR;
210 inode->i_uid = current_fsuid(); 209 inode->i_uid = current_fsuid();
211 inode->i_gid = current_fsgid(); 210 inode->i_gid = current_fsgid();
211 inode->i_flags |= S_PRIVATE;
212 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 212 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
213 return inode; 213 return inode;
214} 214}
diff --git a/fs/attr.c b/fs/attr.c
index 96d394bdaddf..0815e93bb487 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -12,7 +12,6 @@
12#include <linux/capability.h> 12#include <linux/capability.h>
13#include <linux/fsnotify.h> 13#include <linux/fsnotify.h>
14#include <linux/fcntl.h> 14#include <linux/fcntl.h>
15#include <linux/quotaops.h>
16#include <linux/security.h> 15#include <linux/security.h>
17 16
18/* Taken over from the old code... */ 17/* Taken over from the old code... */
@@ -82,7 +81,7 @@ int inode_newsize_ok(const struct inode *inode, loff_t offset)
82 if (inode->i_size < offset) { 81 if (inode->i_size < offset) {
83 unsigned long limit; 82 unsigned long limit;
84 83
85 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 84 limit = rlimit(RLIMIT_FSIZE);
86 if (limit != RLIM_INFINITY && offset > limit) 85 if (limit != RLIM_INFINITY && offset > limit)
87 goto out_sig; 86 goto out_sig;
88 if (offset > inode->i_sb->s_maxbytes) 87 if (offset > inode->i_sb->s_maxbytes)
@@ -212,14 +211,8 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
212 error = inode->i_op->setattr(dentry, attr); 211 error = inode->i_op->setattr(dentry, attr);
213 } else { 212 } else {
214 error = inode_change_ok(inode, attr); 213 error = inode_change_ok(inode, attr);
215 if (!error) { 214 if (!error)
216 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 215 error = inode_setattr(inode, attr);
217 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid))
218 error = vfs_dq_transfer(inode, attr) ?
219 -EDQUOT : 0;
220 if (!error)
221 error = inode_setattr(inode, attr);
222 }
223 } 216 }
224 217
225 if (ia_valid & ATTR_SIZE) 218 if (ia_valid & ATTR_SIZE)
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 4a1401cea0a1..8713c7cfbc79 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -13,6 +13,7 @@
13#include <linux/capability.h> 13#include <linux/capability.h>
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/stat.h> 15#include <linux/stat.h>
16#include <linux/slab.h>
16#include <linux/param.h> 17#include <linux/param.h>
17#include <linux/time.h> 18#include <linux/time.h>
18#include <linux/smp_lock.h> 19#include <linux/smp_lock.h>
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 0118d67221b2..3d283abf67d7 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -60,11 +60,6 @@ do { \
60 current->pid, __func__, ##args); \ 60 current->pid, __func__, ##args); \
61} while (0) 61} while (0)
62 62
63struct rehash_entry {
64 struct task_struct *task;
65 struct list_head list;
66};
67
68/* Unified info structure. This is pointed to by both the dentry and 63/* Unified info structure. This is pointed to by both the dentry and
69 inode structures. Each file in the filesystem has an instance of this 64 inode structures. Each file in the filesystem has an instance of this
70 structure. It holds a reference to the dentry, so dentries are never 65 structure. It holds a reference to the dentry, so dentries are never
@@ -81,7 +76,6 @@ struct autofs_info {
81 76
82 struct list_head active; 77 struct list_head active;
83 int active_count; 78 int active_count;
84 struct list_head rehash_list;
85 79
86 struct list_head expiring; 80 struct list_head expiring;
87 81
@@ -104,7 +98,6 @@ struct autofs_info {
104#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ 98#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */
105#define AUTOFS_INF_MOUNTPOINT (1<<1) /* mountpoint status for direct expire */ 99#define AUTOFS_INF_MOUNTPOINT (1<<1) /* mountpoint status for direct expire */
106#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */ 100#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */
107#define AUTOFS_INF_REHASH (1<<3) /* dentry in transit to ->lookup() */
108 101
109struct autofs_wait_queue { 102struct autofs_wait_queue {
110 wait_queue_head_t queue; 103 wait_queue_head_t queue;
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 00bf8fcb245f..d29b7f6df862 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -22,6 +22,7 @@
22#include <linux/magic.h> 22#include <linux/magic.h>
23#include <linux/dcache.h> 23#include <linux/dcache.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/slab.h>
25 26
26#include "autofs_i.h" 27#include "autofs_i.h"
27 28
@@ -544,10 +545,9 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
544 goto out; 545 goto out;
545 devid = new_encode_dev(path.mnt->mnt_sb->s_dev); 546 devid = new_encode_dev(path.mnt->mnt_sb->s_dev);
546 err = 0; 547 err = 0;
547 if (path.dentry->d_inode && 548 if (path.mnt->mnt_root == path.dentry) {
548 path.mnt->mnt_root == path.dentry) {
549 err = 1; 549 err = 1;
550 magic = path.dentry->d_inode->i_sb->s_magic; 550 magic = path.mnt->mnt_sb->s_magic;
551 } 551 }
552 } else { 552 } else {
553 dev_t dev = sbi->sb->s_dev; 553 dev_t dev = sbi->sb->s_dev;
@@ -560,10 +560,8 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
560 560
561 err = have_submounts(path.dentry); 561 err = have_submounts(path.dentry);
562 562
563 if (path.mnt->mnt_mountpoint != path.mnt->mnt_root) { 563 if (follow_down(&path))
564 if (follow_down(&path)) 564 magic = path.mnt->mnt_sb->s_magic;
565 magic = path.mnt->mnt_sb->s_magic;
566 }
567 } 565 }
568 566
569 param->ismountpoint.out.devid = devid; 567 param->ismountpoint.out.devid = devid;
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 74bc9aa6df31..a796c9417fb1 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -279,7 +279,6 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
279 root->d_mounted--; 279 root->d_mounted--;
280 } 280 }
281 ino->flags |= AUTOFS_INF_EXPIRING; 281 ino->flags |= AUTOFS_INF_EXPIRING;
282 autofs4_add_expiring(root);
283 init_completion(&ino->expire_complete); 282 init_completion(&ino->expire_complete);
284 spin_unlock(&sbi->fs_lock); 283 spin_unlock(&sbi->fs_lock);
285 return root; 284 return root;
@@ -407,7 +406,6 @@ found:
407 expired, (int)expired->d_name.len, expired->d_name.name); 406 expired, (int)expired->d_name.len, expired->d_name.name);
408 ino = autofs4_dentry_ino(expired); 407 ino = autofs4_dentry_ino(expired);
409 ino->flags |= AUTOFS_INF_EXPIRING; 408 ino->flags |= AUTOFS_INF_EXPIRING;
410 autofs4_add_expiring(expired);
411 init_completion(&ino->expire_complete); 409 init_completion(&ino->expire_complete);
412 spin_unlock(&sbi->fs_lock); 410 spin_unlock(&sbi->fs_lock);
413 spin_lock(&dcache_lock); 411 spin_lock(&dcache_lock);
@@ -435,7 +433,7 @@ int autofs4_expire_wait(struct dentry *dentry)
435 433
436 DPRINTK("expire done status=%d", status); 434 DPRINTK("expire done status=%d", status);
437 435
438 if (d_unhashed(dentry) && IS_DEADDIR(dentry->d_inode)) 436 if (d_unhashed(dentry))
439 return -EAGAIN; 437 return -EAGAIN;
440 438
441 return status; 439 return status;
@@ -475,7 +473,6 @@ int autofs4_expire_run(struct super_block *sb,
475 spin_lock(&sbi->fs_lock); 473 spin_lock(&sbi->fs_lock);
476 ino = autofs4_dentry_ino(dentry); 474 ino = autofs4_dentry_ino(dentry);
477 ino->flags &= ~AUTOFS_INF_EXPIRING; 475 ino->flags &= ~AUTOFS_INF_EXPIRING;
478 autofs4_del_expiring(dentry);
479 complete_all(&ino->expire_complete); 476 complete_all(&ino->expire_complete);
480 spin_unlock(&sbi->fs_lock); 477 spin_unlock(&sbi->fs_lock);
481 478
@@ -506,7 +503,6 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
506 ino->flags &= ~AUTOFS_INF_MOUNTPOINT; 503 ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
507 } 504 }
508 ino->flags &= ~AUTOFS_INF_EXPIRING; 505 ino->flags &= ~AUTOFS_INF_EXPIRING;
509 autofs4_del_expiring(dentry);
510 complete_all(&ino->expire_complete); 506 complete_all(&ino->expire_complete);
511 spin_unlock(&sbi->fs_lock); 507 spin_unlock(&sbi->fs_lock);
512 dput(dentry); 508 dput(dentry);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index d0a3de247458..821b2b955dac 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -49,7 +49,6 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
49 ino->dentry = NULL; 49 ino->dentry = NULL;
50 ino->size = 0; 50 ino->size = 0;
51 INIT_LIST_HEAD(&ino->active); 51 INIT_LIST_HEAD(&ino->active);
52 INIT_LIST_HEAD(&ino->rehash_list);
53 ino->active_count = 0; 52 ino->active_count = 0;
54 INIT_LIST_HEAD(&ino->expiring); 53 INIT_LIST_HEAD(&ino->expiring);
55 atomic_set(&ino->count, 0); 54 atomic_set(&ino->count, 0);
@@ -97,63 +96,6 @@ void autofs4_free_ino(struct autofs_info *ino)
97 kfree(ino); 96 kfree(ino);
98} 97}
99 98
100/*
101 * Deal with the infamous "Busy inodes after umount ..." message.
102 *
103 * Clean up the dentry tree. This happens with autofs if the user
104 * space program goes away due to a SIGKILL, SIGSEGV etc.
105 */
106static void autofs4_force_release(struct autofs_sb_info *sbi)
107{
108 struct dentry *this_parent = sbi->sb->s_root;
109 struct list_head *next;
110
111 if (!sbi->sb->s_root)
112 return;
113
114 spin_lock(&dcache_lock);
115repeat:
116 next = this_parent->d_subdirs.next;
117resume:
118 while (next != &this_parent->d_subdirs) {
119 struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
120
121 /* Negative dentry - don`t care */
122 if (!simple_positive(dentry)) {
123 next = next->next;
124 continue;
125 }
126
127 if (!list_empty(&dentry->d_subdirs)) {
128 this_parent = dentry;
129 goto repeat;
130 }
131
132 next = next->next;
133 spin_unlock(&dcache_lock);
134
135 DPRINTK("dentry %p %.*s",
136 dentry, (int)dentry->d_name.len, dentry->d_name.name);
137
138 dput(dentry);
139 spin_lock(&dcache_lock);
140 }
141
142 if (this_parent != sbi->sb->s_root) {
143 struct dentry *dentry = this_parent;
144
145 next = this_parent->d_u.d_child.next;
146 this_parent = this_parent->d_parent;
147 spin_unlock(&dcache_lock);
148 DPRINTK("parent dentry %p %.*s",
149 dentry, (int)dentry->d_name.len, dentry->d_name.name);
150 dput(dentry);
151 spin_lock(&dcache_lock);
152 goto resume;
153 }
154 spin_unlock(&dcache_lock);
155}
156
157void autofs4_kill_sb(struct super_block *sb) 99void autofs4_kill_sb(struct super_block *sb)
158{ 100{
159 struct autofs_sb_info *sbi = autofs4_sbi(sb); 101 struct autofs_sb_info *sbi = autofs4_sbi(sb);
@@ -170,15 +112,12 @@ void autofs4_kill_sb(struct super_block *sb)
170 /* Free wait queues, close pipe */ 112 /* Free wait queues, close pipe */
171 autofs4_catatonic_mode(sbi); 113 autofs4_catatonic_mode(sbi);
172 114
173 /* Clean up and release dangling references */
174 autofs4_force_release(sbi);
175
176 sb->s_fs_info = NULL; 115 sb->s_fs_info = NULL;
177 kfree(sbi); 116 kfree(sbi);
178 117
179out_kill_sb: 118out_kill_sb:
180 DPRINTK("shutting down"); 119 DPRINTK("shutting down");
181 kill_anon_super(sb); 120 kill_litter_super(sb);
182} 121}
183 122
184static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt) 123static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 30cc9ddf4b70..109a6c606d92 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -15,6 +15,7 @@
15#include <linux/capability.h> 15#include <linux/capability.h>
16#include <linux/errno.h> 16#include <linux/errno.h>
17#include <linux/stat.h> 17#include <linux/stat.h>
18#include <linux/slab.h>
18#include <linux/param.h> 19#include <linux/param.h>
19#include <linux/time.h> 20#include <linux/time.h>
20#include "autofs_i.h" 21#include "autofs_i.h"
@@ -104,99 +105,6 @@ static void autofs4_del_active(struct dentry *dentry)
104 return; 105 return;
105} 106}
106 107
107static void autofs4_add_rehash_entry(struct autofs_info *ino,
108 struct rehash_entry *entry)
109{
110 entry->task = current;
111 INIT_LIST_HEAD(&entry->list);
112 list_add(&entry->list, &ino->rehash_list);
113 return;
114}
115
116static void autofs4_remove_rehash_entry(struct autofs_info *ino)
117{
118 struct list_head *head = &ino->rehash_list;
119 struct rehash_entry *entry;
120 list_for_each_entry(entry, head, list) {
121 if (entry->task == current) {
122 list_del(&entry->list);
123 kfree(entry);
124 break;
125 }
126 }
127 return;
128}
129
130static void autofs4_remove_rehash_entrys(struct autofs_info *ino)
131{
132 struct autofs_sb_info *sbi = ino->sbi;
133 struct rehash_entry *entry, *next;
134 struct list_head *head;
135
136 spin_lock(&sbi->fs_lock);
137 spin_lock(&sbi->lookup_lock);
138 if (!(ino->flags & AUTOFS_INF_REHASH)) {
139 spin_unlock(&sbi->lookup_lock);
140 spin_unlock(&sbi->fs_lock);
141 return;
142 }
143 ino->flags &= ~AUTOFS_INF_REHASH;
144 head = &ino->rehash_list;
145 list_for_each_entry_safe(entry, next, head, list) {
146 list_del(&entry->list);
147 kfree(entry);
148 }
149 spin_unlock(&sbi->lookup_lock);
150 spin_unlock(&sbi->fs_lock);
151 dput(ino->dentry);
152
153 return;
154}
155
156static void autofs4_revalidate_drop(struct dentry *dentry,
157 struct rehash_entry *entry)
158{
159 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
160 struct autofs_info *ino = autofs4_dentry_ino(dentry);
161 /*
162 * Add to the active list so we can pick this up in
163 * ->lookup(). Also add an entry to a rehash list so
164 * we know when there are no dentrys in flight so we
165 * know when we can rehash the dentry.
166 */
167 spin_lock(&sbi->lookup_lock);
168 if (list_empty(&ino->active))
169 list_add(&ino->active, &sbi->active_list);
170 autofs4_add_rehash_entry(ino, entry);
171 spin_unlock(&sbi->lookup_lock);
172 if (!(ino->flags & AUTOFS_INF_REHASH)) {
173 ino->flags |= AUTOFS_INF_REHASH;
174 dget(dentry);
175 spin_lock(&dentry->d_lock);
176 __d_drop(dentry);
177 spin_unlock(&dentry->d_lock);
178 }
179 return;
180}
181
182static void autofs4_revalidate_rehash(struct dentry *dentry)
183{
184 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
185 struct autofs_info *ino = autofs4_dentry_ino(dentry);
186 if (ino->flags & AUTOFS_INF_REHASH) {
187 spin_lock(&sbi->lookup_lock);
188 autofs4_remove_rehash_entry(ino);
189 if (list_empty(&ino->rehash_list)) {
190 spin_unlock(&sbi->lookup_lock);
191 ino->flags &= ~AUTOFS_INF_REHASH;
192 d_rehash(dentry);
193 dput(ino->dentry);
194 } else
195 spin_unlock(&sbi->lookup_lock);
196 }
197 return;
198}
199
200static unsigned int autofs4_need_mount(unsigned int flags) 108static unsigned int autofs4_need_mount(unsigned int flags)
201{ 109{
202 unsigned int res = 0; 110 unsigned int res = 0;
@@ -236,7 +144,7 @@ out:
236 return dcache_dir_open(inode, file); 144 return dcache_dir_open(inode, file);
237} 145}
238 146
239static int try_to_fill_dentry(struct dentry *dentry) 147static int try_to_fill_dentry(struct dentry *dentry, int flags)
240{ 148{
241 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 149 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
242 struct autofs_info *ino = autofs4_dentry_ino(dentry); 150 struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -249,17 +157,55 @@ static int try_to_fill_dentry(struct dentry *dentry)
249 * Wait for a pending mount, triggering one if there 157 * Wait for a pending mount, triggering one if there
250 * isn't one already 158 * isn't one already
251 */ 159 */
252 DPRINTK("waiting for mount name=%.*s", 160 if (dentry->d_inode == NULL) {
253 dentry->d_name.len, dentry->d_name.name); 161 DPRINTK("waiting for mount name=%.*s",
162 dentry->d_name.len, dentry->d_name.name);
254 163
255 status = autofs4_wait(sbi, dentry, NFY_MOUNT); 164 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
256 165
257 DPRINTK("mount done status=%d", status); 166 DPRINTK("mount done status=%d", status);
258 167
259 /* Update expiry counter */ 168 /* Turn this into a real negative dentry? */
260 ino->last_used = jiffies; 169 if (status == -ENOENT) {
170 spin_lock(&sbi->fs_lock);
171 ino->flags &= ~AUTOFS_INF_PENDING;
172 spin_unlock(&sbi->fs_lock);
173 return status;
174 } else if (status) {
175 /* Return a negative dentry, but leave it "pending" */
176 return status;
177 }
178 /* Trigger mount for path component or follow link */
179 } else if (ino->flags & AUTOFS_INF_PENDING ||
180 autofs4_need_mount(flags) ||
181 current->link_count) {
182 DPRINTK("waiting for mount name=%.*s",
183 dentry->d_name.len, dentry->d_name.name);
261 184
262 return status; 185 spin_lock(&sbi->fs_lock);
186 ino->flags |= AUTOFS_INF_PENDING;
187 spin_unlock(&sbi->fs_lock);
188 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
189
190 DPRINTK("mount done status=%d", status);
191
192 if (status) {
193 spin_lock(&sbi->fs_lock);
194 ino->flags &= ~AUTOFS_INF_PENDING;
195 spin_unlock(&sbi->fs_lock);
196 return status;
197 }
198 }
199
200 /* Initialize expiry counter after successful mount */
201 if (ino)
202 ino->last_used = jiffies;
203
204 spin_lock(&sbi->fs_lock);
205 ino->flags &= ~AUTOFS_INF_PENDING;
206 spin_unlock(&sbi->fs_lock);
207
208 return 0;
263} 209}
264 210
265/* For autofs direct mounts the follow link triggers the mount */ 211/* For autofs direct mounts the follow link triggers the mount */
@@ -313,16 +259,10 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
313 */ 259 */
314 if (ino->flags & AUTOFS_INF_PENDING || 260 if (ino->flags & AUTOFS_INF_PENDING ||
315 (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) { 261 (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) {
316 ino->flags |= AUTOFS_INF_PENDING;
317 spin_unlock(&dcache_lock); 262 spin_unlock(&dcache_lock);
318 spin_unlock(&sbi->fs_lock); 263 spin_unlock(&sbi->fs_lock);
319 264
320 status = try_to_fill_dentry(dentry); 265 status = try_to_fill_dentry(dentry, 0);
321
322 spin_lock(&sbi->fs_lock);
323 ino->flags &= ~AUTOFS_INF_PENDING;
324 spin_unlock(&sbi->fs_lock);
325
326 if (status) 266 if (status)
327 goto out_error; 267 goto out_error;
328 268
@@ -361,47 +301,18 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
361{ 301{
362 struct inode *dir = dentry->d_parent->d_inode; 302 struct inode *dir = dentry->d_parent->d_inode;
363 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); 303 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
364 struct autofs_info *ino = autofs4_dentry_ino(dentry); 304 int oz_mode = autofs4_oz_mode(sbi);
365 struct rehash_entry *entry;
366 int flags = nd ? nd->flags : 0; 305 int flags = nd ? nd->flags : 0;
367 unsigned int mutex_aquired; 306 int status = 1;
368 307
369 DPRINTK("name = %.*s oz_mode = %d",
370 dentry->d_name.len, dentry->d_name.name, oz_mode);
371
372 /* Daemon never causes a mount to trigger */
373 if (autofs4_oz_mode(sbi))
374 return 1;
375
376 entry = kmalloc(sizeof(struct rehash_entry), GFP_KERNEL);
377 if (!entry)
378 return -ENOMEM;
379
380 mutex_aquired = mutex_trylock(&dir->i_mutex);
381
382 spin_lock(&sbi->fs_lock);
383 spin_lock(&dcache_lock);
384 /* Pending dentry */ 308 /* Pending dentry */
309 spin_lock(&sbi->fs_lock);
385 if (autofs4_ispending(dentry)) { 310 if (autofs4_ispending(dentry)) {
386 int status; 311 /* The daemon never causes a mount to trigger */
387
388 /*
389 * We can only unhash and send this to ->lookup() if
390 * the directory mutex is held over d_revalidate() and
391 * ->lookup(). This prevents the VFS from incorrectly
392 * seeing the dentry as non-existent.
393 */
394 ino->flags |= AUTOFS_INF_PENDING;
395 if (!mutex_aquired) {
396 autofs4_revalidate_drop(dentry, entry);
397 spin_unlock(&dcache_lock);
398 spin_unlock(&sbi->fs_lock);
399 return 0;
400 }
401 spin_unlock(&dcache_lock);
402 spin_unlock(&sbi->fs_lock); 312 spin_unlock(&sbi->fs_lock);
403 mutex_unlock(&dir->i_mutex); 313
404 kfree(entry); 314 if (oz_mode)
315 return 1;
405 316
406 /* 317 /*
407 * If the directory has gone away due to an expire 318 * If the directory has gone away due to an expire
@@ -415,82 +326,45 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
415 * A zero status is success otherwise we have a 326 * A zero status is success otherwise we have a
416 * negative error code. 327 * negative error code.
417 */ 328 */
418 status = try_to_fill_dentry(dentry); 329 status = try_to_fill_dentry(dentry, flags);
419
420 spin_lock(&sbi->fs_lock);
421 ino->flags &= ~AUTOFS_INF_PENDING;
422 spin_unlock(&sbi->fs_lock);
423
424 if (status == 0) 330 if (status == 0)
425 return 1; 331 return 1;
426 332
427 return status; 333 return status;
428 } 334 }
335 spin_unlock(&sbi->fs_lock);
336
337 /* Negative dentry.. invalidate if "old" */
338 if (dentry->d_inode == NULL)
339 return 0;
429 340
430 /* Check for a non-mountpoint directory with no contents */ 341 /* Check for a non-mountpoint directory with no contents */
342 spin_lock(&dcache_lock);
431 if (S_ISDIR(dentry->d_inode->i_mode) && 343 if (S_ISDIR(dentry->d_inode->i_mode) &&
432 !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { 344 !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
433 DPRINTK("dentry=%p %.*s, emptydir", 345 DPRINTK("dentry=%p %.*s, emptydir",
434 dentry, dentry->d_name.len, dentry->d_name.name); 346 dentry, dentry->d_name.len, dentry->d_name.name);
347 spin_unlock(&dcache_lock);
435 348
436 if (autofs4_need_mount(flags) || current->link_count) { 349 /* The daemon never causes a mount to trigger */
437 int status; 350 if (oz_mode)
438 351 return 1;
439 /*
440 * We can only unhash and send this to ->lookup() if
441 * the directory mutex is held over d_revalidate() and
442 * ->lookup(). This prevents the VFS from incorrectly
443 * seeing the dentry as non-existent.
444 */
445 ino->flags |= AUTOFS_INF_PENDING;
446 if (!mutex_aquired) {
447 autofs4_revalidate_drop(dentry, entry);
448 spin_unlock(&dcache_lock);
449 spin_unlock(&sbi->fs_lock);
450 return 0;
451 }
452 spin_unlock(&dcache_lock);
453 spin_unlock(&sbi->fs_lock);
454 mutex_unlock(&dir->i_mutex);
455 kfree(entry);
456
457 /*
458 * A zero status is success otherwise we have a
459 * negative error code.
460 */
461 status = try_to_fill_dentry(dentry);
462
463 spin_lock(&sbi->fs_lock);
464 ino->flags &= ~AUTOFS_INF_PENDING;
465 spin_unlock(&sbi->fs_lock);
466 352
467 if (status == 0) 353 /*
468 return 1; 354 * A zero status is success otherwise we have a
355 * negative error code.
356 */
357 status = try_to_fill_dentry(dentry, flags);
358 if (status == 0)
359 return 1;
469 360
470 return status; 361 return status;
471 }
472 } 362 }
473 spin_unlock(&dcache_lock); 363 spin_unlock(&dcache_lock);
474 spin_unlock(&sbi->fs_lock);
475
476 if (mutex_aquired)
477 mutex_unlock(&dir->i_mutex);
478
479 kfree(entry);
480 364
481 return 1; 365 return 1;
482} 366}
483 367
484static void autofs4_free_rehash_entrys(struct autofs_info *inf)
485{
486 struct list_head *head = &inf->rehash_list;
487 struct rehash_entry *entry, *next;
488 list_for_each_entry_safe(entry, next, head, list) {
489 list_del(&entry->list);
490 kfree(entry);
491 }
492}
493
494void autofs4_dentry_release(struct dentry *de) 368void autofs4_dentry_release(struct dentry *de)
495{ 369{
496 struct autofs_info *inf; 370 struct autofs_info *inf;
@@ -509,8 +383,6 @@ void autofs4_dentry_release(struct dentry *de)
509 list_del(&inf->active); 383 list_del(&inf->active);
510 if (!list_empty(&inf->expiring)) 384 if (!list_empty(&inf->expiring))
511 list_del(&inf->expiring); 385 list_del(&inf->expiring);
512 if (!list_empty(&inf->rehash_list))
513 autofs4_free_rehash_entrys(inf);
514 spin_unlock(&sbi->lookup_lock); 386 spin_unlock(&sbi->lookup_lock);
515 } 387 }
516 388
@@ -543,7 +415,6 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
543 const unsigned char *str = name->name; 415 const unsigned char *str = name->name;
544 struct list_head *p, *head; 416 struct list_head *p, *head;
545 417
546restart:
547 spin_lock(&dcache_lock); 418 spin_lock(&dcache_lock);
548 spin_lock(&sbi->lookup_lock); 419 spin_lock(&sbi->lookup_lock);
549 head = &sbi->active_list; 420 head = &sbi->active_list;
@@ -561,19 +432,6 @@ restart:
561 if (atomic_read(&active->d_count) == 0) 432 if (atomic_read(&active->d_count) == 0)
562 goto next; 433 goto next;
563 434
564 if (active->d_inode && IS_DEADDIR(active->d_inode)) {
565 if (!list_empty(&ino->rehash_list)) {
566 dget(active);
567 spin_unlock(&active->d_lock);
568 spin_unlock(&sbi->lookup_lock);
569 spin_unlock(&dcache_lock);
570 autofs4_remove_rehash_entrys(ino);
571 dput(active);
572 goto restart;
573 }
574 goto next;
575 }
576
577 qstr = &active->d_name; 435 qstr = &active->d_name;
578 436
579 if (active->d_name.hash != hash) 437 if (active->d_name.hash != hash)
@@ -586,11 +444,13 @@ restart:
586 if (memcmp(qstr->name, str, len)) 444 if (memcmp(qstr->name, str, len))
587 goto next; 445 goto next;
588 446
589 dget(active); 447 if (d_unhashed(active)) {
590 spin_unlock(&active->d_lock); 448 dget(active);
591 spin_unlock(&sbi->lookup_lock); 449 spin_unlock(&active->d_lock);
592 spin_unlock(&dcache_lock); 450 spin_unlock(&sbi->lookup_lock);
593 return active; 451 spin_unlock(&dcache_lock);
452 return active;
453 }
594next: 454next:
595 spin_unlock(&active->d_lock); 455 spin_unlock(&active->d_lock);
596 } 456 }
@@ -639,11 +499,13 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
639 if (memcmp(qstr->name, str, len)) 499 if (memcmp(qstr->name, str, len))
640 goto next; 500 goto next;
641 501
642 dget(expiring); 502 if (d_unhashed(expiring)) {
643 spin_unlock(&expiring->d_lock); 503 dget(expiring);
644 spin_unlock(&sbi->lookup_lock); 504 spin_unlock(&expiring->d_lock);
645 spin_unlock(&dcache_lock); 505 spin_unlock(&sbi->lookup_lock);
646 return expiring; 506 spin_unlock(&dcache_lock);
507 return expiring;
508 }
647next: 509next:
648 spin_unlock(&expiring->d_lock); 510 spin_unlock(&expiring->d_lock);
649 } 511 }
@@ -653,48 +515,6 @@ next:
653 return NULL; 515 return NULL;
654} 516}
655 517
656static struct autofs_info *init_new_dentry(struct autofs_sb_info *sbi,
657 struct dentry *dentry, int oz_mode)
658{
659 struct autofs_info *ino;
660
661 /*
662 * Mark the dentry incomplete but don't hash it. We do this
663 * to serialize our inode creation operations (symlink and
664 * mkdir) which prevents deadlock during the callback to
665 * the daemon. Subsequent user space lookups for the same
666 * dentry are placed on the wait queue while the daemon
667 * itself is allowed passage unresticted so the create
668 * operation itself can then hash the dentry. Finally,
669 * we check for the hashed dentry and return the newly
670 * hashed dentry.
671 */
672 dentry->d_op = &autofs4_root_dentry_operations;
673
674 /*
675 * And we need to ensure that the same dentry is used for
676 * all following lookup calls until it is hashed so that
677 * the dentry flags are persistent throughout the request.
678 */
679 ino = autofs4_init_ino(NULL, sbi, 0555);
680 if (!ino)
681 return ERR_PTR(-ENOMEM);
682
683 dentry->d_fsdata = ino;
684 ino->dentry = dentry;
685
686 /*
687 * Only set the mount pending flag for new dentrys not created
688 * by the daemon.
689 */
690 if (!oz_mode)
691 ino->flags |= AUTOFS_INF_PENDING;
692
693 d_instantiate(dentry, NULL);
694
695 return ino;
696}
697
698/* Lookups in the root directory */ 518/* Lookups in the root directory */
699static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 519static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
700{ 520{
@@ -702,7 +522,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
702 struct autofs_info *ino; 522 struct autofs_info *ino;
703 struct dentry *expiring, *active; 523 struct dentry *expiring, *active;
704 int oz_mode; 524 int oz_mode;
705 int status = 0;
706 525
707 DPRINTK("name = %.*s", 526 DPRINTK("name = %.*s",
708 dentry->d_name.len, dentry->d_name.name); 527 dentry->d_name.len, dentry->d_name.name);
@@ -717,26 +536,44 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
717 DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", 536 DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
718 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode); 537 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
719 538
720 spin_lock(&sbi->fs_lock);
721 active = autofs4_lookup_active(dentry); 539 active = autofs4_lookup_active(dentry);
722 if (active) { 540 if (active) {
723 dentry = active; 541 dentry = active;
724 ino = autofs4_dentry_ino(dentry); 542 ino = autofs4_dentry_ino(dentry);
725 /* If this came from revalidate, rehash it */
726 autofs4_revalidate_rehash(dentry);
727 spin_unlock(&sbi->fs_lock);
728 } else { 543 } else {
729 spin_unlock(&sbi->fs_lock); 544 /*
730 ino = init_new_dentry(sbi, dentry, oz_mode); 545 * Mark the dentry incomplete but don't hash it. We do this
731 if (IS_ERR(ino)) 546 * to serialize our inode creation operations (symlink and
732 return (struct dentry *) ino; 547 * mkdir) which prevents deadlock during the callback to
733 } 548 * the daemon. Subsequent user space lookups for the same
549 * dentry are placed on the wait queue while the daemon
550 * itself is allowed passage unresticted so the create
551 * operation itself can then hash the dentry. Finally,
552 * we check for the hashed dentry and return the newly
553 * hashed dentry.
554 */
555 dentry->d_op = &autofs4_root_dentry_operations;
556
557 /*
558 * And we need to ensure that the same dentry is used for
559 * all following lookup calls until it is hashed so that
560 * the dentry flags are persistent throughout the request.
561 */
562 ino = autofs4_init_ino(NULL, sbi, 0555);
563 if (!ino)
564 return ERR_PTR(-ENOMEM);
734 565
735 autofs4_add_active(dentry); 566 dentry->d_fsdata = ino;
567 ino->dentry = dentry;
568
569 autofs4_add_active(dentry);
570
571 d_instantiate(dentry, NULL);
572 }
736 573
737 if (!oz_mode) { 574 if (!oz_mode) {
738 expiring = autofs4_lookup_expiring(dentry);
739 mutex_unlock(&dir->i_mutex); 575 mutex_unlock(&dir->i_mutex);
576 expiring = autofs4_lookup_expiring(dentry);
740 if (expiring) { 577 if (expiring) {
741 /* 578 /*
742 * If we are racing with expire the request might not 579 * If we are racing with expire the request might not
@@ -744,22 +581,23 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
744 * so it must have been successful, so just wait for it. 581 * so it must have been successful, so just wait for it.
745 */ 582 */
746 autofs4_expire_wait(expiring); 583 autofs4_expire_wait(expiring);
584 autofs4_del_expiring(expiring);
747 dput(expiring); 585 dput(expiring);
748 } 586 }
749 status = try_to_fill_dentry(dentry); 587
750 mutex_lock(&dir->i_mutex);
751 spin_lock(&sbi->fs_lock); 588 spin_lock(&sbi->fs_lock);
752 ino->flags &= ~AUTOFS_INF_PENDING; 589 ino->flags |= AUTOFS_INF_PENDING;
753 spin_unlock(&sbi->fs_lock); 590 spin_unlock(&sbi->fs_lock);
591 if (dentry->d_op && dentry->d_op->d_revalidate)
592 (dentry->d_op->d_revalidate)(dentry, nd);
593 mutex_lock(&dir->i_mutex);
754 } 594 }
755 595
756 autofs4_del_active(dentry);
757
758 /* 596 /*
759 * If we had a mount fail, check if we had to handle 597 * If we are still pending, check if we had to handle
760 * a signal. If so we can force a restart.. 598 * a signal. If so we can force a restart..
761 */ 599 */
762 if (status) { 600 if (ino->flags & AUTOFS_INF_PENDING) {
763 /* See if we were interrupted */ 601 /* See if we were interrupted */
764 if (signal_pending(current)) { 602 if (signal_pending(current)) {
765 sigset_t *sigset = &current->pending.signal; 603 sigset_t *sigset = &current->pending.signal;
@@ -771,46 +609,43 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
771 return ERR_PTR(-ERESTARTNOINTR); 609 return ERR_PTR(-ERESTARTNOINTR);
772 } 610 }
773 } 611 }
774 } 612 if (!oz_mode) {
775 613 spin_lock(&sbi->fs_lock);
776 /* 614 ino->flags &= ~AUTOFS_INF_PENDING;
777 * User space can (and has done in the past) remove and re-create 615 spin_unlock(&sbi->fs_lock);
778 * this directory during the callback. This can leave us with an
779 * unhashed dentry, but a successful mount! So we need to
780 * perform another cached lookup in case the dentry now exists.
781 */
782 if (!oz_mode && !have_submounts(dentry)) {
783 struct dentry *new;
784 new = d_lookup(dentry->d_parent, &dentry->d_name);
785 if (new) {
786 if (active)
787 dput(active);
788 return new;
789 } else {
790 if (!status)
791 status = -ENOENT;
792 } 616 }
793 } 617 }
794 618
795 /* 619 /*
796 * If we had a mount failure, return status to user space. 620 * If this dentry is unhashed, then we shouldn't honour this
797 * If the mount succeeded and we used a dentry from the active queue 621 * lookup. Returning ENOENT here doesn't do the right thing
798 * return it. 622 * for all system calls, but it should be OK for the operations
623 * we permit from an autofs.
799 */ 624 */
800 if (status) { 625 if (!oz_mode && d_unhashed(dentry)) {
801 dentry = ERR_PTR(status);
802 if (active)
803 dput(active);
804 return dentry;
805 } else {
806 /* 626 /*
807 * Valid successful mount, return active dentry or NULL 627 * A user space application can (and has done in the past)
808 * for a new dentry. 628 * remove and re-create this directory during the callback.
629 * This can leave us with an unhashed dentry, but a
630 * successful mount! So we need to perform another
631 * cached lookup in case the dentry now exists.
809 */ 632 */
633 struct dentry *parent = dentry->d_parent;
634 struct dentry *new = d_lookup(parent, &dentry->d_name);
635 if (new != NULL)
636 dentry = new;
637 else
638 dentry = ERR_PTR(-ENOENT);
639
810 if (active) 640 if (active)
811 return active; 641 dput(active);
642
643 return dentry;
812 } 644 }
813 645
646 if (active)
647 return active;
648
814 return NULL; 649 return NULL;
815} 650}
816 651
@@ -834,6 +669,8 @@ static int autofs4_dir_symlink(struct inode *dir,
834 if (!ino) 669 if (!ino)
835 return -ENOMEM; 670 return -ENOMEM;
836 671
672 autofs4_del_active(dentry);
673
837 ino->size = strlen(symname); 674 ino->size = strlen(symname);
838 cp = kmalloc(ino->size + 1, GFP_KERNEL); 675 cp = kmalloc(ino->size + 1, GFP_KERNEL);
839 if (!cp) { 676 if (!cp) {
@@ -910,6 +747,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
910 dir->i_mtime = CURRENT_TIME; 747 dir->i_mtime = CURRENT_TIME;
911 748
912 spin_lock(&dcache_lock); 749 spin_lock(&dcache_lock);
750 autofs4_add_expiring(dentry);
913 spin_lock(&dentry->d_lock); 751 spin_lock(&dentry->d_lock);
914 __d_drop(dentry); 752 __d_drop(dentry);
915 spin_unlock(&dentry->d_lock); 753 spin_unlock(&dentry->d_lock);
@@ -935,6 +773,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
935 spin_unlock(&dcache_lock); 773 spin_unlock(&dcache_lock);
936 return -ENOTEMPTY; 774 return -ENOTEMPTY;
937 } 775 }
776 autofs4_add_expiring(dentry);
938 spin_lock(&dentry->d_lock); 777 spin_lock(&dentry->d_lock);
939 __d_drop(dentry); 778 __d_drop(dentry);
940 spin_unlock(&dentry->d_lock); 779 spin_unlock(&dentry->d_lock);
@@ -972,6 +811,8 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
972 if (!ino) 811 if (!ino)
973 return -ENOMEM; 812 return -ENOMEM;
974 813
814 autofs4_del_active(dentry);
815
975 inode = autofs4_get_inode(dir->i_sb, ino); 816 inode = autofs4_get_inode(dir->i_sb, ino);
976 if (!inode) { 817 if (!inode) {
977 if (!dentry->d_fsdata) 818 if (!dentry->d_fsdata)
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index e3287d0d1a58..59096b5e0fc7 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -11,7 +11,6 @@
11 */ 11 */
12 12
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/slab.h>
15#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
16#include <linux/string.h> 15#include <linux/string.h>
17 16
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 33baf27fac78..34ddda888e63 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -873,6 +873,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
873 brelse(bh); 873 brelse(bh);
874 874
875 unacquire_priv_sbp: 875 unacquire_priv_sbp:
876 kfree(befs_sb->mount_opts.iocharset);
876 kfree(sb->s_fs_info); 877 kfree(sb->s_fs_info);
877 878
878 unacquire_none: 879 unacquire_none:
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 6f60336c6628..f22a7d3dc362 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -15,6 +15,7 @@
15#include <linux/smp_lock.h> 15#include <linux/smp_lock.h>
16#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
17#include <linux/vfs.h> 17#include <linux/vfs.h>
18#include <linux/writeback.h>
18#include <asm/uaccess.h> 19#include <asm/uaccess.h>
19#include "bfs.h" 20#include "bfs.h"
20 21
@@ -98,7 +99,7 @@ error:
98 return ERR_PTR(-EIO); 99 return ERR_PTR(-EIO);
99} 100}
100 101
101static int bfs_write_inode(struct inode *inode, int wait) 102static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
102{ 103{
103 struct bfs_sb_info *info = BFS_SB(inode->i_sb); 104 struct bfs_sb_info *info = BFS_SB(inode->i_sb);
104 unsigned int ino = (u16)inode->i_ino; 105 unsigned int ino = (u16)inode->i_ino;
@@ -147,7 +148,7 @@ static int bfs_write_inode(struct inode *inode, int wait)
147 di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); 148 di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
148 149
149 mark_buffer_dirty(bh); 150 mark_buffer_dirty(bh);
150 if (wait) { 151 if (wbc->sync_mode == WB_SYNC_ALL) {
151 sync_dirty_buffer(bh); 152 sync_dirty_buffer(bh);
152 if (buffer_req(bh) && !buffer_uptodate(bh)) 153 if (buffer_req(bh) && !buffer_uptodate(bh))
153 err = -EIO; 154 err = -EIO;
@@ -353,35 +354,35 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
353 struct inode *inode; 354 struct inode *inode;
354 unsigned i, imap_len; 355 unsigned i, imap_len;
355 struct bfs_sb_info *info; 356 struct bfs_sb_info *info;
356 long ret = -EINVAL; 357 int ret = -EINVAL;
357 unsigned long i_sblock, i_eblock, i_eoff, s_size; 358 unsigned long i_sblock, i_eblock, i_eoff, s_size;
358 359
359 info = kzalloc(sizeof(*info), GFP_KERNEL); 360 info = kzalloc(sizeof(*info), GFP_KERNEL);
360 if (!info) 361 if (!info)
361 return -ENOMEM; 362 return -ENOMEM;
363 mutex_init(&info->bfs_lock);
362 s->s_fs_info = info; 364 s->s_fs_info = info;
363 365
364 sb_set_blocksize(s, BFS_BSIZE); 366 sb_set_blocksize(s, BFS_BSIZE);
365 367
366 bh = sb_bread(s, 0); 368 info->si_sbh = sb_bread(s, 0);
367 if(!bh) 369 if (!info->si_sbh)
368 goto out; 370 goto out;
369 bfs_sb = (struct bfs_super_block *)bh->b_data; 371 bfs_sb = (struct bfs_super_block *)info->si_sbh->b_data;
370 if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) { 372 if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) {
371 if (!silent) 373 if (!silent)
372 printf("No BFS filesystem on %s (magic=%08x)\n", 374 printf("No BFS filesystem on %s (magic=%08x)\n",
373 s->s_id, le32_to_cpu(bfs_sb->s_magic)); 375 s->s_id, le32_to_cpu(bfs_sb->s_magic));
374 goto out; 376 goto out1;
375 } 377 }
376 if (BFS_UNCLEAN(bfs_sb, s) && !silent) 378 if (BFS_UNCLEAN(bfs_sb, s) && !silent)
377 printf("%s is unclean, continuing\n", s->s_id); 379 printf("%s is unclean, continuing\n", s->s_id);
378 380
379 s->s_magic = BFS_MAGIC; 381 s->s_magic = BFS_MAGIC;
380 info->si_sbh = bh;
381 382
382 if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) { 383 if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) {
383 printf("Superblock is corrupted\n"); 384 printf("Superblock is corrupted\n");
384 goto out; 385 goto out1;
385 } 386 }
386 387
387 info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / 388 info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) /
@@ -390,7 +391,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
390 imap_len = (info->si_lasti / 8) + 1; 391 imap_len = (info->si_lasti / 8) + 1;
391 info->si_imap = kzalloc(imap_len, GFP_KERNEL); 392 info->si_imap = kzalloc(imap_len, GFP_KERNEL);
392 if (!info->si_imap) 393 if (!info->si_imap)
393 goto out; 394 goto out1;
394 for (i = 0; i < BFS_ROOT_INO; i++) 395 for (i = 0; i < BFS_ROOT_INO; i++)
395 set_bit(i, info->si_imap); 396 set_bit(i, info->si_imap);
396 397
@@ -398,15 +399,13 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
398 inode = bfs_iget(s, BFS_ROOT_INO); 399 inode = bfs_iget(s, BFS_ROOT_INO);
399 if (IS_ERR(inode)) { 400 if (IS_ERR(inode)) {
400 ret = PTR_ERR(inode); 401 ret = PTR_ERR(inode);
401 kfree(info->si_imap); 402 goto out2;
402 goto out;
403 } 403 }
404 s->s_root = d_alloc_root(inode); 404 s->s_root = d_alloc_root(inode);
405 if (!s->s_root) { 405 if (!s->s_root) {
406 iput(inode); 406 iput(inode);
407 ret = -ENOMEM; 407 ret = -ENOMEM;
408 kfree(info->si_imap); 408 goto out2;
409 goto out;
410 } 409 }
411 410
412 info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS; 411 info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS;
@@ -419,10 +418,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
419 bh = sb_bread(s, info->si_blocks - 1); 418 bh = sb_bread(s, info->si_blocks - 1);
420 if (!bh) { 419 if (!bh) {
421 printf("Last block not available: %lu\n", info->si_blocks - 1); 420 printf("Last block not available: %lu\n", info->si_blocks - 1);
422 iput(inode);
423 ret = -EIO; 421 ret = -EIO;
424 kfree(info->si_imap); 422 goto out3;
425 goto out;
426 } 423 }
427 brelse(bh); 424 brelse(bh);
428 425
@@ -459,11 +456,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
459 printf("Inode 0x%08x corrupted\n", i); 456 printf("Inode 0x%08x corrupted\n", i);
460 457
461 brelse(bh); 458 brelse(bh);
462 s->s_root = NULL; 459 ret = -EIO;
463 kfree(info->si_imap); 460 goto out3;
464 kfree(info);
465 s->s_fs_info = NULL;
466 return -EIO;
467 } 461 }
468 462
469 if (!di->i_ino) { 463 if (!di->i_ino) {
@@ -483,11 +477,17 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
483 s->s_dirt = 1; 477 s->s_dirt = 1;
484 } 478 }
485 dump_imap("read_super", s); 479 dump_imap("read_super", s);
486 mutex_init(&info->bfs_lock);
487 return 0; 480 return 0;
488 481
482out3:
483 dput(s->s_root);
484 s->s_root = NULL;
485out2:
486 kfree(info->si_imap);
487out1:
488 brelse(info->si_sbh);
489out: 489out:
490 brelse(bh); 490 mutex_destroy(&info->bfs_lock);
491 kfree(info); 491 kfree(info);
492 s->s_fs_info = NULL; 492 s->s_fs_info = NULL;
493 return ret; 493 return ret;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 346b69405363..f96eff04e11a 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -20,10 +20,11 @@
20#include <linux/fcntl.h> 20#include <linux/fcntl.h>
21#include <linux/ptrace.h> 21#include <linux/ptrace.h>
22#include <linux/user.h> 22#include <linux/user.h>
23#include <linux/slab.h>
24#include <linux/binfmts.h> 23#include <linux/binfmts.h>
25#include <linux/personality.h> 24#include <linux/personality.h>
26#include <linux/init.h> 25#include <linux/init.h>
26#include <linux/coredump.h>
27#include <linux/slab.h>
27 28
28#include <asm/system.h> 29#include <asm/system.h>
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
@@ -60,26 +61,6 @@ static int set_brk(unsigned long start, unsigned long end)
60} 61}
61 62
62/* 63/*
63 * These are the only things you should do on a core-file: use only these
64 * macros to write out all the necessary info.
65 */
66
67static int dump_write(struct file *file, const void *addr, int nr)
68{
69 return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
70}
71
72#define DUMP_WRITE(addr, nr) \
73 if (!dump_write(file, (void *)(addr), (nr))) \
74 goto end_coredump;
75
76#define DUMP_SEEK(offset) \
77if (file->f_op->llseek) { \
78 if (file->f_op->llseek(file,(offset),0) != (offset)) \
79 goto end_coredump; \
80} else file->f_pos = (offset)
81
82/*
83 * Routine writes a core dump image in the current directory. 64 * Routine writes a core dump image in the current directory.
84 * Currently only a stub-function. 65 * Currently only a stub-function.
85 * 66 *
@@ -94,14 +75,16 @@ static int aout_core_dump(struct coredump_params *cprm)
94 struct file *file = cprm->file; 75 struct file *file = cprm->file;
95 mm_segment_t fs; 76 mm_segment_t fs;
96 int has_dumped = 0; 77 int has_dumped = 0;
97 unsigned long dump_start, dump_size; 78 void __user *dump_start;
79 int dump_size;
98 struct user dump; 80 struct user dump;
99#ifdef __alpha__ 81#ifdef __alpha__
100# define START_DATA(u) (u.start_data) 82# define START_DATA(u) ((void __user *)u.start_data)
101#else 83#else
102# define START_DATA(u) ((u.u_tsize << PAGE_SHIFT) + u.start_code) 84# define START_DATA(u) ((void __user *)((u.u_tsize << PAGE_SHIFT) + \
85 u.start_code))
103#endif 86#endif
104# define START_STACK(u) (u.start_stack) 87# define START_STACK(u) ((void __user *)u.start_stack)
105 88
106 fs = get_fs(); 89 fs = get_fs();
107 set_fs(KERNEL_DS); 90 set_fs(KERNEL_DS);
@@ -123,33 +106,38 @@ static int aout_core_dump(struct coredump_params *cprm)
123 106
124/* make sure we actually have a data and stack area to dump */ 107/* make sure we actually have a data and stack area to dump */
125 set_fs(USER_DS); 108 set_fs(USER_DS);
126 if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) 109 if (!access_ok(VERIFY_READ, START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
127 dump.u_dsize = 0; 110 dump.u_dsize = 0;
128 if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) 111 if (!access_ok(VERIFY_READ, START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
129 dump.u_ssize = 0; 112 dump.u_ssize = 0;
130 113
131 set_fs(KERNEL_DS); 114 set_fs(KERNEL_DS);
132/* struct user */ 115/* struct user */
133 DUMP_WRITE(&dump,sizeof(dump)); 116 if (!dump_write(file, &dump, sizeof(dump)))
117 goto end_coredump;
134/* Now dump all of the user data. Include malloced stuff as well */ 118/* Now dump all of the user data. Include malloced stuff as well */
135 DUMP_SEEK(PAGE_SIZE); 119 if (!dump_seek(cprm->file, PAGE_SIZE - sizeof(dump)))
120 goto end_coredump;
136/* now we start writing out the user space info */ 121/* now we start writing out the user space info */
137 set_fs(USER_DS); 122 set_fs(USER_DS);
138/* Dump the data area */ 123/* Dump the data area */
139 if (dump.u_dsize != 0) { 124 if (dump.u_dsize != 0) {
140 dump_start = START_DATA(dump); 125 dump_start = START_DATA(dump);
141 dump_size = dump.u_dsize << PAGE_SHIFT; 126 dump_size = dump.u_dsize << PAGE_SHIFT;
142 DUMP_WRITE(dump_start,dump_size); 127 if (!dump_write(file, dump_start, dump_size))
128 goto end_coredump;
143 } 129 }
144/* Now prepare to dump the stack area */ 130/* Now prepare to dump the stack area */
145 if (dump.u_ssize != 0) { 131 if (dump.u_ssize != 0) {
146 dump_start = START_STACK(dump); 132 dump_start = START_STACK(dump);
147 dump_size = dump.u_ssize << PAGE_SHIFT; 133 dump_size = dump.u_ssize << PAGE_SHIFT;
148 DUMP_WRITE(dump_start,dump_size); 134 if (!dump_write(file, dump_start, dump_size))
135 goto end_coredump;
149 } 136 }
150/* Finally dump the task struct. Not be used by gdb, but could be useful */ 137/* Finally dump the task struct. Not be used by gdb, but could be useful */
151 set_fs(KERNEL_DS); 138 set_fs(KERNEL_DS);
152 DUMP_WRITE(current,sizeof(*current)); 139 if (!dump_write(file, current, sizeof(*current)))
140 goto end_coredump;
153end_coredump: 141end_coredump:
154 set_fs(fs); 142 set_fs(fs);
155 return has_dumped; 143 return has_dumped;
@@ -247,7 +235,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
247 * size limits imposed on them by creating programs with large 235 * size limits imposed on them by creating programs with large
248 * arrays in the data or bss. 236 * arrays in the data or bss.
249 */ 237 */
250 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; 238 rlim = rlimit(RLIMIT_DATA);
251 if (rlim >= RLIM_INFINITY) 239 if (rlim >= RLIM_INFINITY)
252 rlim = ~0; 240 rlim = ~0;
253 if (ex.a_data + ex.a_bss > rlim) 241 if (ex.a_data + ex.a_bss > rlim)
@@ -264,6 +252,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
264#else 252#else
265 set_personality(PER_LINUX); 253 set_personality(PER_LINUX);
266#endif 254#endif
255 setup_new_exec(bprm);
267 256
268 current->mm->end_code = ex.a_text + 257 current->mm->end_code = ex.a_text +
269 (current->mm->start_code = N_TXTADDR(ex)); 258 (current->mm->start_code = N_TXTADDR(ex));
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index edd90c49003c..535e763ab1a6 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -31,6 +31,7 @@
31#include <linux/random.h> 31#include <linux/random.h>
32#include <linux/elf.h> 32#include <linux/elf.h>
33#include <linux/utsname.h> 33#include <linux/utsname.h>
34#include <linux/coredump.h>
34#include <asm/uaccess.h> 35#include <asm/uaccess.h>
35#include <asm/param.h> 36#include <asm/param.h>
36#include <asm/page.h> 37#include <asm/page.h>
@@ -662,27 +663,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
662 if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0') 663 if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
663 goto out_free_interp; 664 goto out_free_interp;
664 665
665 /*
666 * The early SET_PERSONALITY here is so that the lookup
667 * for the interpreter happens in the namespace of the
668 * to-be-execed image. SET_PERSONALITY can select an
669 * alternate root.
670 *
671 * However, SET_PERSONALITY is NOT allowed to switch
672 * this task into the new images's memory mapping
673 * policy - that is, TASK_SIZE must still evaluate to
674 * that which is appropriate to the execing application.
675 * This is because exit_mmap() needs to have TASK_SIZE
676 * evaluate to the size of the old image.
677 *
678 * So if (say) a 64-bit application is execing a 32-bit
679 * application it is the architecture's responsibility
680 * to defer changing the value of TASK_SIZE until the
681 * switch really is going to happen - do this in
682 * flush_thread(). - akpm
683 */
684 SET_PERSONALITY(loc->elf_ex);
685
686 interpreter = open_exec(elf_interpreter); 666 interpreter = open_exec(elf_interpreter);
687 retval = PTR_ERR(interpreter); 667 retval = PTR_ERR(interpreter);
688 if (IS_ERR(interpreter)) 668 if (IS_ERR(interpreter))
@@ -730,9 +710,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
730 /* Verify the interpreter has a valid arch */ 710 /* Verify the interpreter has a valid arch */
731 if (!elf_check_arch(&loc->interp_elf_ex)) 711 if (!elf_check_arch(&loc->interp_elf_ex))
732 goto out_free_dentry; 712 goto out_free_dentry;
733 } else {
734 /* Executables without an interpreter also need a personality */
735 SET_PERSONALITY(loc->elf_ex);
736 } 713 }
737 714
738 /* Flush all traces of the currently running executable */ 715 /* Flush all traces of the currently running executable */
@@ -752,7 +729,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
752 729
753 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) 730 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
754 current->flags |= PF_RANDOMIZE; 731 current->flags |= PF_RANDOMIZE;
755 arch_pick_mmap_layout(current->mm); 732
733 setup_new_exec(bprm);
756 734
757 /* Do this so that we can load the interpreter, if need be. We will 735 /* Do this so that we can load the interpreter, if need be. We will
758 change some of these later */ 736 change some of these later */
@@ -1108,36 +1086,6 @@ out:
1108 * Modelled on fs/exec.c:aout_core_dump() 1086 * Modelled on fs/exec.c:aout_core_dump()
1109 * Jeremy Fitzhardinge <jeremy@sw.oz.au> 1087 * Jeremy Fitzhardinge <jeremy@sw.oz.au>
1110 */ 1088 */
1111/*
1112 * These are the only things you should do on a core-file: use only these
1113 * functions to write out all the necessary info.
1114 */
1115static int dump_write(struct file *file, const void *addr, int nr)
1116{
1117 return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
1118}
1119
1120static int dump_seek(struct file *file, loff_t off)
1121{
1122 if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
1123 if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
1124 return 0;
1125 } else {
1126 char *buf = (char *)get_zeroed_page(GFP_KERNEL);
1127 if (!buf)
1128 return 0;
1129 while (off > 0) {
1130 unsigned long n = off;
1131 if (n > PAGE_SIZE)
1132 n = PAGE_SIZE;
1133 if (!dump_write(file, buf, n))
1134 return 0;
1135 off -= n;
1136 }
1137 free_page((unsigned long)buf);
1138 }
1139 return 1;
1140}
1141 1089
1142/* 1090/*
1143 * Decide what to dump of a segment, part, all or none. 1091 * Decide what to dump of a segment, part, all or none.
@@ -1272,11 +1220,6 @@ static int writenote(struct memelfnote *men, struct file *file,
1272} 1220}
1273#undef DUMP_WRITE 1221#undef DUMP_WRITE
1274 1222
1275#define DUMP_WRITE(addr, nr) \
1276 if ((size += (nr)) > cprm->limit || \
1277 !dump_write(cprm->file, (addr), (nr))) \
1278 goto end_coredump;
1279
1280static void fill_elf_header(struct elfhdr *elf, int segs, 1223static void fill_elf_header(struct elfhdr *elf, int segs,
1281 u16 machine, u32 flags, u8 osabi) 1224 u16 machine, u32 flags, u8 osabi)
1282{ 1225{
@@ -1895,6 +1838,34 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
1895 return gate_vma; 1838 return gate_vma;
1896} 1839}
1897 1840
1841static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
1842 elf_addr_t e_shoff, int segs)
1843{
1844 elf->e_shoff = e_shoff;
1845 elf->e_shentsize = sizeof(*shdr4extnum);
1846 elf->e_shnum = 1;
1847 elf->e_shstrndx = SHN_UNDEF;
1848
1849 memset(shdr4extnum, 0, sizeof(*shdr4extnum));
1850
1851 shdr4extnum->sh_type = SHT_NULL;
1852 shdr4extnum->sh_size = elf->e_shnum;
1853 shdr4extnum->sh_link = elf->e_shstrndx;
1854 shdr4extnum->sh_info = segs;
1855}
1856
1857static size_t elf_core_vma_data_size(struct vm_area_struct *gate_vma,
1858 unsigned long mm_flags)
1859{
1860 struct vm_area_struct *vma;
1861 size_t size = 0;
1862
1863 for (vma = first_vma(current, gate_vma); vma != NULL;
1864 vma = next_vma(vma, gate_vma))
1865 size += vma_dump_size(vma, mm_flags);
1866 return size;
1867}
1868
1898/* 1869/*
1899 * Actual dumper 1870 * Actual dumper
1900 * 1871 *
@@ -1911,8 +1882,11 @@ static int elf_core_dump(struct coredump_params *cprm)
1911 struct vm_area_struct *vma, *gate_vma; 1882 struct vm_area_struct *vma, *gate_vma;
1912 struct elfhdr *elf = NULL; 1883 struct elfhdr *elf = NULL;
1913 loff_t offset = 0, dataoff, foffset; 1884 loff_t offset = 0, dataoff, foffset;
1914 unsigned long mm_flags;
1915 struct elf_note_info info; 1885 struct elf_note_info info;
1886 struct elf_phdr *phdr4note = NULL;
1887 struct elf_shdr *shdr4extnum = NULL;
1888 Elf_Half e_phnum;
1889 elf_addr_t e_shoff;
1916 1890
1917 /* 1891 /*
1918 * We no longer stop all VM operations. 1892 * We no longer stop all VM operations.
@@ -1935,20 +1909,25 @@ static int elf_core_dump(struct coredump_params *cprm)
1935 * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. 1909 * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
1936 */ 1910 */
1937 segs = current->mm->map_count; 1911 segs = current->mm->map_count;
1938#ifdef ELF_CORE_EXTRA_PHDRS 1912 segs += elf_core_extra_phdrs();
1939 segs += ELF_CORE_EXTRA_PHDRS;
1940#endif
1941 1913
1942 gate_vma = get_gate_vma(current); 1914 gate_vma = get_gate_vma(current);
1943 if (gate_vma != NULL) 1915 if (gate_vma != NULL)
1944 segs++; 1916 segs++;
1945 1917
1918 /* for notes section */
1919 segs++;
1920
1921 /* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid
1922 * this, kernel supports extended numbering. Have a look at
1923 * include/linux/elf.h for further information. */
1924 e_phnum = segs > PN_XNUM ? PN_XNUM : segs;
1925
1946 /* 1926 /*
1947 * Collect all the non-memory information about the process for the 1927 * Collect all the non-memory information about the process for the
1948 * notes. This also sets up the file header. 1928 * notes. This also sets up the file header.
1949 */ 1929 */
1950 if (!fill_note_info(elf, segs + 1, /* including notes section */ 1930 if (!fill_note_info(elf, e_phnum, &info, cprm->signr, cprm->regs))
1951 &info, cprm->signr, cprm->regs))
1952 goto cleanup; 1931 goto cleanup;
1953 1932
1954 has_dumped = 1; 1933 has_dumped = 1;
@@ -1957,31 +1936,47 @@ static int elf_core_dump(struct coredump_params *cprm)
1957 fs = get_fs(); 1936 fs = get_fs();
1958 set_fs(KERNEL_DS); 1937 set_fs(KERNEL_DS);
1959 1938
1960 DUMP_WRITE(elf, sizeof(*elf));
1961 offset += sizeof(*elf); /* Elf header */ 1939 offset += sizeof(*elf); /* Elf header */
1962 offset += (segs + 1) * sizeof(struct elf_phdr); /* Program headers */ 1940 offset += segs * sizeof(struct elf_phdr); /* Program headers */
1963 foffset = offset; 1941 foffset = offset;
1964 1942
1965 /* Write notes phdr entry */ 1943 /* Write notes phdr entry */
1966 { 1944 {
1967 struct elf_phdr phdr;
1968 size_t sz = get_note_info_size(&info); 1945 size_t sz = get_note_info_size(&info);
1969 1946
1970 sz += elf_coredump_extra_notes_size(); 1947 sz += elf_coredump_extra_notes_size();
1971 1948
1972 fill_elf_note_phdr(&phdr, sz, offset); 1949 phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
1950 if (!phdr4note)
1951 goto end_coredump;
1952
1953 fill_elf_note_phdr(phdr4note, sz, offset);
1973 offset += sz; 1954 offset += sz;
1974 DUMP_WRITE(&phdr, sizeof(phdr));
1975 } 1955 }
1976 1956
1977 dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); 1957 dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
1978 1958
1979 /* 1959 offset += elf_core_vma_data_size(gate_vma, cprm->mm_flags);
1980 * We must use the same mm->flags while dumping core to avoid 1960 offset += elf_core_extra_data_size();
1981 * inconsistency between the program headers and bodies, otherwise an 1961 e_shoff = offset;
1982 * unusable core file can be generated. 1962
1983 */ 1963 if (e_phnum == PN_XNUM) {
1984 mm_flags = current->mm->flags; 1964 shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL);
1965 if (!shdr4extnum)
1966 goto end_coredump;
1967 fill_extnum_info(elf, shdr4extnum, e_shoff, segs);
1968 }
1969
1970 offset = dataoff;
1971
1972 size += sizeof(*elf);
1973 if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf)))
1974 goto end_coredump;
1975
1976 size += sizeof(*phdr4note);
1977 if (size > cprm->limit
1978 || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note)))
1979 goto end_coredump;
1985 1980
1986 /* Write program headers for segments dump */ 1981 /* Write program headers for segments dump */
1987 for (vma = first_vma(current, gate_vma); vma != NULL; 1982 for (vma = first_vma(current, gate_vma); vma != NULL;
@@ -1992,7 +1987,7 @@ static int elf_core_dump(struct coredump_params *cprm)
1992 phdr.p_offset = offset; 1987 phdr.p_offset = offset;
1993 phdr.p_vaddr = vma->vm_start; 1988 phdr.p_vaddr = vma->vm_start;
1994 phdr.p_paddr = 0; 1989 phdr.p_paddr = 0;
1995 phdr.p_filesz = vma_dump_size(vma, mm_flags); 1990 phdr.p_filesz = vma_dump_size(vma, cprm->mm_flags);
1996 phdr.p_memsz = vma->vm_end - vma->vm_start; 1991 phdr.p_memsz = vma->vm_end - vma->vm_start;
1997 offset += phdr.p_filesz; 1992 offset += phdr.p_filesz;
1998 phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; 1993 phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
@@ -2002,12 +1997,14 @@ static int elf_core_dump(struct coredump_params *cprm)
2002 phdr.p_flags |= PF_X; 1997 phdr.p_flags |= PF_X;
2003 phdr.p_align = ELF_EXEC_PAGESIZE; 1998 phdr.p_align = ELF_EXEC_PAGESIZE;
2004 1999
2005 DUMP_WRITE(&phdr, sizeof(phdr)); 2000 size += sizeof(phdr);
2001 if (size > cprm->limit
2002 || !dump_write(cprm->file, &phdr, sizeof(phdr)))
2003 goto end_coredump;
2006 } 2004 }
2007 2005
2008#ifdef ELF_CORE_WRITE_EXTRA_PHDRS 2006 if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit))
2009 ELF_CORE_WRITE_EXTRA_PHDRS; 2007 goto end_coredump;
2010#endif
2011 2008
2012 /* write out the notes section */ 2009 /* write out the notes section */
2013 if (!write_note_info(&info, cprm->file, &foffset)) 2010 if (!write_note_info(&info, cprm->file, &foffset))
@@ -2025,7 +2022,7 @@ static int elf_core_dump(struct coredump_params *cprm)
2025 unsigned long addr; 2022 unsigned long addr;
2026 unsigned long end; 2023 unsigned long end;
2027 2024
2028 end = vma->vm_start + vma_dump_size(vma, mm_flags); 2025 end = vma->vm_start + vma_dump_size(vma, cprm->mm_flags);
2029 2026
2030 for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { 2027 for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) {
2031 struct page *page; 2028 struct page *page;
@@ -2046,15 +2043,24 @@ static int elf_core_dump(struct coredump_params *cprm)
2046 } 2043 }
2047 } 2044 }
2048 2045
2049#ifdef ELF_CORE_WRITE_EXTRA_DATA 2046 if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit))
2050 ELF_CORE_WRITE_EXTRA_DATA; 2047 goto end_coredump;
2051#endif 2048
2049 if (e_phnum == PN_XNUM) {
2050 size += sizeof(*shdr4extnum);
2051 if (size > cprm->limit
2052 || !dump_write(cprm->file, shdr4extnum,
2053 sizeof(*shdr4extnum)))
2054 goto end_coredump;
2055 }
2052 2056
2053end_coredump: 2057end_coredump:
2054 set_fs(fs); 2058 set_fs(fs);
2055 2059
2056cleanup: 2060cleanup:
2057 free_note_info(&info); 2061 free_note_info(&info);
2062 kfree(shdr4extnum);
2063 kfree(phdr4note);
2058 kfree(elf); 2064 kfree(elf);
2059out: 2065out:
2060 return has_dumped; 2066 return has_dumped;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index c25256a5c5b0..2c5f9a0e5d72 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -34,6 +34,7 @@
34#include <linux/elf.h> 34#include <linux/elf.h>
35#include <linux/elf-fdpic.h> 35#include <linux/elf-fdpic.h>
36#include <linux/elfcore.h> 36#include <linux/elfcore.h>
37#include <linux/coredump.h>
37 38
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
39#include <asm/param.h> 40#include <asm/param.h>
@@ -171,6 +172,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
171#ifdef ELF_FDPIC_PLAT_INIT 172#ifdef ELF_FDPIC_PLAT_INIT
172 unsigned long dynaddr; 173 unsigned long dynaddr;
173#endif 174#endif
175#ifndef CONFIG_MMU
176 unsigned long stack_prot;
177#endif
174 struct file *interpreter = NULL; /* to shut gcc up */ 178 struct file *interpreter = NULL; /* to shut gcc up */
175 char *interpreter_name = NULL; 179 char *interpreter_name = NULL;
176 int executable_stack; 180 int executable_stack;
@@ -316,6 +320,11 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
316 * defunct, deceased, etc. after this point we have to exit via 320 * defunct, deceased, etc. after this point we have to exit via
317 * error_kill */ 321 * error_kill */
318 set_personality(PER_LINUX_FDPIC); 322 set_personality(PER_LINUX_FDPIC);
323 if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
324 current->personality |= READ_IMPLIES_EXEC;
325
326 setup_new_exec(bprm);
327
319 set_binfmt(&elf_fdpic_format); 328 set_binfmt(&elf_fdpic_format);
320 329
321 current->mm->start_code = 0; 330 current->mm->start_code = 0;
@@ -377,9 +386,13 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
377 if (stack_size < PAGE_SIZE * 2) 386 if (stack_size < PAGE_SIZE * 2)
378 stack_size = PAGE_SIZE * 2; 387 stack_size = PAGE_SIZE * 2;
379 388
389 stack_prot = PROT_READ | PROT_WRITE;
390 if (executable_stack == EXSTACK_ENABLE_X ||
391 (executable_stack == EXSTACK_DEFAULT && VM_STACK_FLAGS & VM_EXEC))
392 stack_prot |= PROT_EXEC;
393
380 down_write(&current->mm->mmap_sem); 394 down_write(&current->mm->mmap_sem);
381 current->mm->start_brk = do_mmap(NULL, 0, stack_size, 395 current->mm->start_brk = do_mmap(NULL, 0, stack_size, stack_prot,
382 PROT_READ | PROT_WRITE | PROT_EXEC,
383 MAP_PRIVATE | MAP_ANONYMOUS | 396 MAP_PRIVATE | MAP_ANONYMOUS |
384 MAP_UNINITIALIZED | MAP_GROWSDOWN, 397 MAP_UNINITIALIZED | MAP_GROWSDOWN,
385 0); 398 0);
@@ -992,15 +1005,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
992 } 1005 }
993 } else if (!mm->start_data) { 1006 } else if (!mm->start_data) {
994 mm->start_data = seg->addr; 1007 mm->start_data = seg->addr;
995#ifndef CONFIG_MMU
996 mm->end_data = seg->addr + phdr->p_memsz; 1008 mm->end_data = seg->addr + phdr->p_memsz;
997#endif
998 } 1009 }
999
1000#ifdef CONFIG_MMU
1001 if (seg->addr + phdr->p_memsz > mm->end_data)
1002 mm->end_data = seg->addr + phdr->p_memsz;
1003#endif
1004 } 1010 }
1005 1011
1006 seg++; 1012 seg++;
@@ -1204,26 +1210,6 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
1204#ifdef CONFIG_ELF_CORE 1210#ifdef CONFIG_ELF_CORE
1205 1211
1206/* 1212/*
1207 * These are the only things you should do on a core-file: use only these
1208 * functions to write out all the necessary info.
1209 */
1210static int dump_write(struct file *file, const void *addr, int nr)
1211{
1212 return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
1213}
1214
1215static int dump_seek(struct file *file, loff_t off)
1216{
1217 if (file->f_op->llseek) {
1218 if (file->f_op->llseek(file, off, SEEK_SET) != off)
1219 return 0;
1220 } else {
1221 file->f_pos = off;
1222 }
1223 return 1;
1224}
1225
1226/*
1227 * Decide whether a segment is worth dumping; default is yes to be 1213 * Decide whether a segment is worth dumping; default is yes to be
1228 * sure (missing info is worse than too much; etc). 1214 * sure (missing info is worse than too much; etc).
1229 * Personally I'd include everything, and use the coredump limit... 1215 * Personally I'd include everything, and use the coredump limit...
@@ -1301,35 +1287,35 @@ static int notesize(struct memelfnote *en)
1301 1287
1302/* #define DEBUG */ 1288/* #define DEBUG */
1303 1289
1304#define DUMP_WRITE(addr, nr) \ 1290#define DUMP_WRITE(addr, nr, foffset) \
1305 do { if (!dump_write(file, (addr), (nr))) return 0; } while(0) 1291 do { if (!dump_write(file, (addr), (nr))) return 0; *foffset += (nr); } while(0)
1306#define DUMP_SEEK(off) \
1307 do { if (!dump_seek(file, (off))) return 0; } while(0)
1308 1292
1309static int writenote(struct memelfnote *men, struct file *file) 1293static int alignfile(struct file *file, loff_t *foffset)
1310{ 1294{
1311 struct elf_note en; 1295 static const char buf[4] = { 0, };
1296 DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset);
1297 return 1;
1298}
1312 1299
1300static int writenote(struct memelfnote *men, struct file *file,
1301 loff_t *foffset)
1302{
1303 struct elf_note en;
1313 en.n_namesz = strlen(men->name) + 1; 1304 en.n_namesz = strlen(men->name) + 1;
1314 en.n_descsz = men->datasz; 1305 en.n_descsz = men->datasz;
1315 en.n_type = men->type; 1306 en.n_type = men->type;
1316 1307
1317 DUMP_WRITE(&en, sizeof(en)); 1308 DUMP_WRITE(&en, sizeof(en), foffset);
1318 DUMP_WRITE(men->name, en.n_namesz); 1309 DUMP_WRITE(men->name, en.n_namesz, foffset);
1319 /* XXX - cast from long long to long to avoid need for libgcc.a */ 1310 if (!alignfile(file, foffset))
1320 DUMP_SEEK(roundup((unsigned long)file->f_pos, 4)); /* XXX */ 1311 return 0;
1321 DUMP_WRITE(men->data, men->datasz); 1312 DUMP_WRITE(men->data, men->datasz, foffset);
1322 DUMP_SEEK(roundup((unsigned long)file->f_pos, 4)); /* XXX */ 1313 if (!alignfile(file, foffset))
1314 return 0;
1323 1315
1324 return 1; 1316 return 1;
1325} 1317}
1326#undef DUMP_WRITE 1318#undef DUMP_WRITE
1327#undef DUMP_SEEK
1328
1329#define DUMP_WRITE(addr, nr) \
1330 if ((size += (nr)) > cprm->limit || \
1331 !dump_write(cprm->file, (addr), (nr))) \
1332 goto end_coredump;
1333 1319
1334static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs) 1320static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs)
1335{ 1321{
@@ -1381,7 +1367,7 @@ static inline void fill_note(struct memelfnote *note, const char *name, int type
1381 1367
1382/* 1368/*
1383 * fill up all the fields in prstatus from the given task struct, except 1369 * fill up all the fields in prstatus from the given task struct, except
1384 * registers which need to be filled up seperately. 1370 * registers which need to be filled up separately.
1385 */ 1371 */
1386static void fill_prstatus(struct elf_prstatus *prstatus, 1372static void fill_prstatus(struct elf_prstatus *prstatus,
1387 struct task_struct *p, long signr) 1373 struct task_struct *p, long signr)
@@ -1512,6 +1498,22 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
1512 return sz; 1498 return sz;
1513} 1499}
1514 1500
1501static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
1502 elf_addr_t e_shoff, int segs)
1503{
1504 elf->e_shoff = e_shoff;
1505 elf->e_shentsize = sizeof(*shdr4extnum);
1506 elf->e_shnum = 1;
1507 elf->e_shstrndx = SHN_UNDEF;
1508
1509 memset(shdr4extnum, 0, sizeof(*shdr4extnum));
1510
1511 shdr4extnum->sh_type = SHT_NULL;
1512 shdr4extnum->sh_size = elf->e_shnum;
1513 shdr4extnum->sh_link = elf->e_shstrndx;
1514 shdr4extnum->sh_info = segs;
1515}
1516
1515/* 1517/*
1516 * dump the segments for an MMU process 1518 * dump the segments for an MMU process
1517 */ 1519 */
@@ -1540,7 +1542,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
1540 err = -EIO; 1542 err = -EIO;
1541 kunmap(page); 1543 kunmap(page);
1542 page_cache_release(page); 1544 page_cache_release(page);
1543 } else if (!dump_seek(file, file->f_pos + PAGE_SIZE)) 1545 } else if (!dump_seek(file, PAGE_SIZE))
1544 err = -EFBIG; 1546 err = -EFBIG;
1545 if (err) 1547 if (err)
1546 goto out; 1548 goto out;
@@ -1576,6 +1578,17 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
1576} 1578}
1577#endif 1579#endif
1578 1580
1581static size_t elf_core_vma_data_size(unsigned long mm_flags)
1582{
1583 struct vm_area_struct *vma;
1584 size_t size = 0;
1585
1586 for (vma = current->mm->mmap; vma; vma = vma->vm_next)
1587 if (maydump(vma, mm_flags))
1588 size += vma->vm_end - vma->vm_start;
1589 return size;
1590}
1591
1579/* 1592/*
1580 * Actual dumper 1593 * Actual dumper
1581 * 1594 *
@@ -1593,7 +1606,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
1593 int i; 1606 int i;
1594 struct vm_area_struct *vma; 1607 struct vm_area_struct *vma;
1595 struct elfhdr *elf = NULL; 1608 struct elfhdr *elf = NULL;
1596 loff_t offset = 0, dataoff; 1609 loff_t offset = 0, dataoff, foffset;
1597 int numnote; 1610 int numnote;
1598 struct memelfnote *notes = NULL; 1611 struct memelfnote *notes = NULL;
1599 struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */ 1612 struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */
@@ -1606,7 +1619,10 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
1606#endif 1619#endif
1607 int thread_status_size = 0; 1620 int thread_status_size = 0;
1608 elf_addr_t *auxv; 1621 elf_addr_t *auxv;
1609 unsigned long mm_flags; 1622 struct elf_phdr *phdr4note = NULL;
1623 struct elf_shdr *shdr4extnum = NULL;
1624 Elf_Half e_phnum;
1625 elf_addr_t e_shoff;
1610 1626
1611 /* 1627 /*
1612 * We no longer stop all VM operations. 1628 * We no longer stop all VM operations.
@@ -1671,12 +1687,18 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
1671 elf_core_copy_regs(&prstatus->pr_reg, cprm->regs); 1687 elf_core_copy_regs(&prstatus->pr_reg, cprm->regs);
1672 1688
1673 segs = current->mm->map_count; 1689 segs = current->mm->map_count;
1674#ifdef ELF_CORE_EXTRA_PHDRS 1690 segs += elf_core_extra_phdrs();
1675 segs += ELF_CORE_EXTRA_PHDRS; 1691
1676#endif 1692 /* for notes section */
1693 segs++;
1694
1695 /* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid
1696 * this, kernel supports extended numbering. Have a look at
1697 * include/linux/elf.h for further information. */
1698 e_phnum = segs > PN_XNUM ? PN_XNUM : segs;
1677 1699
1678 /* Set up header */ 1700 /* Set up header */
1679 fill_elf_fdpic_header(elf, segs + 1); /* including notes section */ 1701 fill_elf_fdpic_header(elf, e_phnum);
1680 1702
1681 has_dumped = 1; 1703 has_dumped = 1;
1682 current->flags |= PF_DUMPCORE; 1704 current->flags |= PF_DUMPCORE;
@@ -1715,13 +1737,12 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
1715 fs = get_fs(); 1737 fs = get_fs();
1716 set_fs(KERNEL_DS); 1738 set_fs(KERNEL_DS);
1717 1739
1718 DUMP_WRITE(elf, sizeof(*elf));
1719 offset += sizeof(*elf); /* Elf header */ 1740 offset += sizeof(*elf); /* Elf header */
1720 offset += (segs+1) * sizeof(struct elf_phdr); /* Program headers */ 1741 offset += segs * sizeof(struct elf_phdr); /* Program headers */
1742 foffset = offset;
1721 1743
1722 /* Write notes phdr entry */ 1744 /* Write notes phdr entry */
1723 { 1745 {
1724 struct elf_phdr phdr;
1725 int sz = 0; 1746 int sz = 0;
1726 1747
1727 for (i = 0; i < numnote; i++) 1748 for (i = 0; i < numnote; i++)
@@ -1729,20 +1750,38 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
1729 1750
1730 sz += thread_status_size; 1751 sz += thread_status_size;
1731 1752
1732 fill_elf_note_phdr(&phdr, sz, offset); 1753 phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
1754 if (!phdr4note)
1755 goto end_coredump;
1756
1757 fill_elf_note_phdr(phdr4note, sz, offset);
1733 offset += sz; 1758 offset += sz;
1734 DUMP_WRITE(&phdr, sizeof(phdr));
1735 } 1759 }
1736 1760
1737 /* Page-align dumped data */ 1761 /* Page-align dumped data */
1738 dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); 1762 dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
1739 1763
1740 /* 1764 offset += elf_core_vma_data_size(cprm->mm_flags);
1741 * We must use the same mm->flags while dumping core to avoid 1765 offset += elf_core_extra_data_size();
1742 * inconsistency between the program headers and bodies, otherwise an 1766 e_shoff = offset;
1743 * unusable core file can be generated. 1767
1744 */ 1768 if (e_phnum == PN_XNUM) {
1745 mm_flags = current->mm->flags; 1769 shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL);
1770 if (!shdr4extnum)
1771 goto end_coredump;
1772 fill_extnum_info(elf, shdr4extnum, e_shoff, segs);
1773 }
1774
1775 offset = dataoff;
1776
1777 size += sizeof(*elf);
1778 if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf)))
1779 goto end_coredump;
1780
1781 size += sizeof(*phdr4note);
1782 if (size > cprm->limit
1783 || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note)))
1784 goto end_coredump;
1746 1785
1747 /* write program headers for segments dump */ 1786 /* write program headers for segments dump */
1748 for (vma = current->mm->mmap; vma; vma = vma->vm_next) { 1787 for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
@@ -1755,7 +1794,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
1755 phdr.p_offset = offset; 1794 phdr.p_offset = offset;
1756 phdr.p_vaddr = vma->vm_start; 1795 phdr.p_vaddr = vma->vm_start;
1757 phdr.p_paddr = 0; 1796 phdr.p_paddr = 0;
1758 phdr.p_filesz = maydump(vma, mm_flags) ? sz : 0; 1797 phdr.p_filesz = maydump(vma, cprm->mm_flags) ? sz : 0;
1759 phdr.p_memsz = sz; 1798 phdr.p_memsz = sz;
1760 offset += phdr.p_filesz; 1799 offset += phdr.p_filesz;
1761 phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; 1800 phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
@@ -1765,16 +1804,18 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
1765 phdr.p_flags |= PF_X; 1804 phdr.p_flags |= PF_X;
1766 phdr.p_align = ELF_EXEC_PAGESIZE; 1805 phdr.p_align = ELF_EXEC_PAGESIZE;
1767 1806
1768 DUMP_WRITE(&phdr, sizeof(phdr)); 1807 size += sizeof(phdr);
1808 if (size > cprm->limit
1809 || !dump_write(cprm->file, &phdr, sizeof(phdr)))
1810 goto end_coredump;
1769 } 1811 }
1770 1812
1771#ifdef ELF_CORE_WRITE_EXTRA_PHDRS 1813 if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit))
1772 ELF_CORE_WRITE_EXTRA_PHDRS; 1814 goto end_coredump;
1773#endif
1774 1815
1775 /* write out the notes section */ 1816 /* write out the notes section */
1776 for (i = 0; i < numnote; i++) 1817 for (i = 0; i < numnote; i++)
1777 if (!writenote(notes + i, cprm->file)) 1818 if (!writenote(notes + i, cprm->file, &foffset))
1778 goto end_coredump; 1819 goto end_coredump;
1779 1820
1780 /* write out the thread status notes section */ 1821 /* write out the thread status notes section */
@@ -1783,26 +1824,33 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
1783 list_entry(t, struct elf_thread_status, list); 1824 list_entry(t, struct elf_thread_status, list);
1784 1825
1785 for (i = 0; i < tmp->num_notes; i++) 1826 for (i = 0; i < tmp->num_notes; i++)
1786 if (!writenote(&tmp->notes[i], cprm->file)) 1827 if (!writenote(&tmp->notes[i], cprm->file, &foffset))
1787 goto end_coredump; 1828 goto end_coredump;
1788 } 1829 }
1789 1830
1790 if (!dump_seek(cprm->file, dataoff)) 1831 if (!dump_seek(cprm->file, dataoff - foffset))
1791 goto end_coredump; 1832 goto end_coredump;
1792 1833
1793 if (elf_fdpic_dump_segments(cprm->file, &size, &cprm->limit, 1834 if (elf_fdpic_dump_segments(cprm->file, &size, &cprm->limit,
1794 mm_flags) < 0) 1835 cprm->mm_flags) < 0)
1795 goto end_coredump; 1836 goto end_coredump;
1796 1837
1797#ifdef ELF_CORE_WRITE_EXTRA_DATA 1838 if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit))
1798 ELF_CORE_WRITE_EXTRA_DATA; 1839 goto end_coredump;
1799#endif
1800 1840
1801 if (file->f_pos != offset) { 1841 if (e_phnum == PN_XNUM) {
1842 size += sizeof(*shdr4extnum);
1843 if (size > cprm->limit
1844 || !dump_write(cprm->file, shdr4extnum,
1845 sizeof(*shdr4extnum)))
1846 goto end_coredump;
1847 }
1848
1849 if (cprm->file->f_pos != offset) {
1802 /* Sanity check */ 1850 /* Sanity check */
1803 printk(KERN_WARNING 1851 printk(KERN_WARNING
1804 "elf_core_dump: file->f_pos (%lld) != offset (%lld)\n", 1852 "elf_core_dump: file->f_pos (%lld) != offset (%lld)\n",
1805 file->f_pos, offset); 1853 cprm->file->f_pos, offset);
1806 } 1854 }
1807 1855
1808end_coredump: 1856end_coredump:
@@ -1814,7 +1862,7 @@ cleanup:
1814 list_del(tmp); 1862 list_del(tmp);
1815 kfree(list_entry(tmp, struct elf_thread_status, list)); 1863 kfree(list_entry(tmp, struct elf_thread_status, list));
1816 } 1864 }
1817 1865 kfree(phdr4note);
1818 kfree(elf); 1866 kfree(elf);
1819 kfree(prstatus); 1867 kfree(prstatus);
1820 kfree(psinfo); 1868 kfree(psinfo);
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 32fb00b52cd0..b8e8b0acf9bd 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -11,7 +11,6 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/stat.h> 13#include <linux/stat.h>
14#include <linux/slab.h>
15#include <linux/binfmts.h> 14#include <linux/binfmts.h>
16#include <linux/elf.h> 15#include <linux/elf.h>
17#include <linux/init.h> 16#include <linux/init.h>
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index d4a00ea1054c..49566c1687d8 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -355,7 +355,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
355 355
356 if (!flat_reloc_valid(r, start_brk - start_data + text_len)) { 356 if (!flat_reloc_valid(r, start_brk - start_data + text_len)) {
357 printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)", 357 printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)",
358 (int) r,(int)(start_brk-start_code),(int)text_len); 358 (int) r,(int)(start_brk-start_data+text_len),(int)text_len);
359 goto failed; 359 goto failed;
360 } 360 }
361 361
@@ -501,7 +501,7 @@ static int load_flat_file(struct linux_binprm * bprm,
501 * size limits imposed on them by creating programs with large 501 * size limits imposed on them by creating programs with large
502 * arrays in the data or bss. 502 * arrays in the data or bss.
503 */ 503 */
504 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; 504 rlim = rlimit(RLIMIT_DATA);
505 if (rlim >= RLIM_INFINITY) 505 if (rlim >= RLIM_INFINITY)
506 rlim = ~0; 506 rlim = ~0;
507 if (data_len + bss_len > rlim) { 507 if (data_len + bss_len > rlim) {
@@ -519,6 +519,7 @@ static int load_flat_file(struct linux_binprm * bprm,
519 519
520 /* OK, This is the point of no return */ 520 /* OK, This is the point of no return */
521 set_personality(PER_LINUX_32BIT); 521 set_personality(PER_LINUX_32BIT);
522 setup_new_exec(bprm);
522 } 523 }
523 524
524 /* 525 /*
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 08343505e184..aca9d55afb22 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -8,7 +8,6 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/string.h> 9#include <linux/string.h>
10#include <linux/stat.h> 10#include <linux/stat.h>
11#include <linux/slab.h>
12#include <linux/binfmts.h> 11#include <linux/binfmts.h>
13#include <linux/init.h> 12#include <linux/init.h>
14#include <linux/file.h> 13#include <linux/file.h>
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 2a9b5330cc5e..cc8560f6c9b0 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -227,6 +227,7 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
227 /* OK, This is the point of no return */ 227 /* OK, This is the point of no return */
228 current->flags &= ~PF_FORKNOEXEC; 228 current->flags &= ~PF_FORKNOEXEC;
229 current->personality = PER_HPUX; 229 current->personality = PER_HPUX;
230 setup_new_exec(bprm);
230 231
231 /* Set the task size for HP-UX processes such that 232 /* Set the task size for HP-UX processes such that
232 * the gateway page is outside the address space. 233 * the gateway page is outside the address space.
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 49a34e7f7306..612a5c38d3c1 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -24,6 +24,7 @@
24#include <linux/mempool.h> 24#include <linux/mempool.h>
25#include <linux/bio.h> 25#include <linux/bio.h>
26#include <linux/workqueue.h> 26#include <linux/workqueue.h>
27#include <linux/slab.h>
27 28
28struct integrity_slab { 29struct integrity_slab {
29 struct kmem_cache *slab; 30 struct kmem_cache *slab;
@@ -61,7 +62,7 @@ static inline unsigned int vecs_to_idx(unsigned int nr)
61 62
62static inline int use_bip_pool(unsigned int idx) 63static inline int use_bip_pool(unsigned int idx)
63{ 64{
64 if (idx == BIOVEC_NR_POOLS) 65 if (idx == BIOVEC_MAX_IDX)
65 return 1; 66 return 1;
66 67
67 return 0; 68 return 0;
@@ -95,6 +96,7 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
95 96
96 /* Use mempool if lower order alloc failed or max vecs were requested */ 97 /* Use mempool if lower order alloc failed or max vecs were requested */
97 if (bip == NULL) { 98 if (bip == NULL) {
99 idx = BIOVEC_MAX_IDX; /* so we free the payload properly later */
98 bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); 100 bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
99 101
100 if (unlikely(bip == NULL)) { 102 if (unlikely(bip == NULL)) {
diff --git a/fs/bio.c b/fs/bio.c
index 76e6713abf94..e7bf6ca64dcf 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -78,7 +78,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
78 78
79 i = 0; 79 i = 0;
80 while (i < bio_slab_nr) { 80 while (i < bio_slab_nr) {
81 struct bio_slab *bslab = &bio_slabs[i]; 81 bslab = &bio_slabs[i];
82 82
83 if (!bslab->slab && entry == -1) 83 if (!bslab->slab && entry == -1)
84 entry = i; 84 entry = i;
@@ -264,13 +264,12 @@ EXPORT_SYMBOL(bio_init);
264 * bio_alloc_bioset - allocate a bio for I/O 264 * bio_alloc_bioset - allocate a bio for I/O
265 * @gfp_mask: the GFP_ mask given to the slab allocator 265 * @gfp_mask: the GFP_ mask given to the slab allocator
266 * @nr_iovecs: number of iovecs to pre-allocate 266 * @nr_iovecs: number of iovecs to pre-allocate
267 * @bs: the bio_set to allocate from. If %NULL, just use kmalloc 267 * @bs: the bio_set to allocate from.
268 * 268 *
269 * Description: 269 * Description:
270 * bio_alloc_bioset will first try its own mempool to satisfy the allocation. 270 * bio_alloc_bioset will try its own mempool to satisfy the allocation.
271 * If %__GFP_WAIT is set then we will block on the internal pool waiting 271 * If %__GFP_WAIT is set then we will block on the internal pool waiting
272 * for a &struct bio to become free. If a %NULL @bs is passed in, we will 272 * for a &struct bio to become free.
273 * fall back to just using @kmalloc to allocate the required memory.
274 * 273 *
275 * Note that the caller must set ->bi_destructor on successful return 274 * Note that the caller must set ->bi_destructor on successful return
276 * of a bio, to do the appropriate freeing of the bio once the reference 275 * of a bio, to do the appropriate freeing of the bio once the reference
@@ -507,10 +506,8 @@ int bio_get_nr_vecs(struct block_device *bdev)
507 int nr_pages; 506 int nr_pages;
508 507
509 nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; 508 nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
510 if (nr_pages > queue_max_phys_segments(q)) 509 if (nr_pages > queue_max_segments(q))
511 nr_pages = queue_max_phys_segments(q); 510 nr_pages = queue_max_segments(q);
512 if (nr_pages > queue_max_hw_segments(q))
513 nr_pages = queue_max_hw_segments(q);
514 511
515 return nr_pages; 512 return nr_pages;
516} 513}
@@ -542,17 +539,22 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
542 539
543 if (page == prev->bv_page && 540 if (page == prev->bv_page &&
544 offset == prev->bv_offset + prev->bv_len) { 541 offset == prev->bv_offset + prev->bv_len) {
542 unsigned int prev_bv_len = prev->bv_len;
545 prev->bv_len += len; 543 prev->bv_len += len;
546 544
547 if (q->merge_bvec_fn) { 545 if (q->merge_bvec_fn) {
548 struct bvec_merge_data bvm = { 546 struct bvec_merge_data bvm = {
547 /* prev_bvec is already charged in
548 bi_size, discharge it in order to
549 simulate merging updated prev_bvec
550 as new bvec. */
549 .bi_bdev = bio->bi_bdev, 551 .bi_bdev = bio->bi_bdev,
550 .bi_sector = bio->bi_sector, 552 .bi_sector = bio->bi_sector,
551 .bi_size = bio->bi_size, 553 .bi_size = bio->bi_size - prev_bv_len,
552 .bi_rw = bio->bi_rw, 554 .bi_rw = bio->bi_rw,
553 }; 555 };
554 556
555 if (q->merge_bvec_fn(q, &bvm, prev) < len) { 557 if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) {
556 prev->bv_len -= len; 558 prev->bv_len -= len;
557 return 0; 559 return 0;
558 } 560 }
@@ -570,8 +572,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
570 * make this too complex. 572 * make this too complex.
571 */ 573 */
572 574
573 while (bio->bi_phys_segments >= queue_max_phys_segments(q) 575 while (bio->bi_phys_segments >= queue_max_segments(q)) {
574 || bio->bi_phys_segments >= queue_max_hw_segments(q)) {
575 576
576 if (retried_segments) 577 if (retried_segments)
577 return 0; 578 return 0;
@@ -606,7 +607,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
606 * merge_bvec_fn() returns number of bytes it can accept 607 * merge_bvec_fn() returns number of bytes it can accept
607 * at this offset 608 * at this offset
608 */ 609 */
609 if (q->merge_bvec_fn(q, &bvm, bvec) < len) { 610 if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
610 bvec->bv_page = NULL; 611 bvec->bv_page = NULL;
611 bvec->bv_len = 0; 612 bvec->bv_len = 0;
612 bvec->bv_offset = 0; 613 bvec->bv_offset = 0;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 73d6a735b8f3..6dcee88c2e5d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -246,7 +246,8 @@ struct super_block *freeze_bdev(struct block_device *bdev)
246 if (!sb) 246 if (!sb)
247 goto out; 247 goto out;
248 if (sb->s_flags & MS_RDONLY) { 248 if (sb->s_flags & MS_RDONLY) {
249 deactivate_locked_super(sb); 249 sb->s_frozen = SB_FREEZE_TRANS;
250 up_write(&sb->s_umount);
250 mutex_unlock(&bdev->bd_fsfreeze_mutex); 251 mutex_unlock(&bdev->bd_fsfreeze_mutex);
251 return sb; 252 return sb;
252 } 253 }
@@ -307,7 +308,7 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
307 BUG_ON(sb->s_bdev != bdev); 308 BUG_ON(sb->s_bdev != bdev);
308 down_write(&sb->s_umount); 309 down_write(&sb->s_umount);
309 if (sb->s_flags & MS_RDONLY) 310 if (sb->s_flags & MS_RDONLY)
310 goto out_deactivate; 311 goto out_unfrozen;
311 312
312 if (sb->s_op->unfreeze_fs) { 313 if (sb->s_op->unfreeze_fs) {
313 error = sb->s_op->unfreeze_fs(sb); 314 error = sb->s_op->unfreeze_fs(sb);
@@ -321,11 +322,11 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
321 } 322 }
322 } 323 }
323 324
325out_unfrozen:
324 sb->s_frozen = SB_UNFROZEN; 326 sb->s_frozen = SB_UNFROZEN;
325 smp_wmb(); 327 smp_wmb();
326 wake_up(&sb->s_wait_unfrozen); 328 wake_up(&sb->s_wait_unfrozen);
327 329
328out_deactivate:
329 if (sb) 330 if (sb)
330 deactivate_locked_super(sb); 331 deactivate_locked_super(sb);
331out_unlock: 332out_unlock:
@@ -403,20 +404,28 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
403 * NULL first argument is nfsd_sync_dir() and that's not a directory. 404 * NULL first argument is nfsd_sync_dir() and that's not a directory.
404 */ 405 */
405 406
406static int block_fsync(struct file *filp, struct dentry *dentry, int datasync) 407int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
407{ 408{
408 struct block_device *bdev = I_BDEV(filp->f_mapping->host); 409 struct inode *bd_inode = filp->f_mapping->host;
410 struct block_device *bdev = I_BDEV(bd_inode);
409 int error; 411 int error;
410 412
411 error = sync_blockdev(bdev); 413 /*
412 if (error) 414 * There is no need to serialise calls to blkdev_issue_flush with
413 return error; 415 * i_mutex and doing so causes performance issues with concurrent
414 416 * O_SYNC writers to a block device.
417 */
418 mutex_unlock(&bd_inode->i_mutex);
419
415 error = blkdev_issue_flush(bdev, NULL); 420 error = blkdev_issue_flush(bdev, NULL);
416 if (error == -EOPNOTSUPP) 421 if (error == -EOPNOTSUPP)
417 error = 0; 422 error = 0;
423
424 mutex_lock(&bd_inode->i_mutex);
425
418 return error; 426 return error;
419} 427}
428EXPORT_SYMBOL(blkdev_fsync);
420 429
421/* 430/*
422 * pseudo-fs 431 * pseudo-fs
@@ -1480,7 +1489,7 @@ const struct file_operations def_blk_fops = {
1480 .aio_read = generic_file_aio_read, 1489 .aio_read = generic_file_aio_read,
1481 .aio_write = blkdev_aio_write, 1490 .aio_write = blkdev_aio_write,
1482 .mmap = generic_file_mmap, 1491 .mmap = generic_file_mmap,
1483 .fsync = block_fsync, 1492 .fsync = blkdev_fsync,
1484 .unlocked_ioctl = block_ioctl, 1493 .unlocked_ioctl = block_ioctl,
1485#ifdef CONFIG_COMPAT 1494#ifdef CONFIG_COMPAT
1486 .compat_ioctl = compat_blkdev_ioctl, 1495 .compat_ioctl = compat_blkdev_ioctl,
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 2e9e69987a82..6ef7b26724ec 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -22,6 +22,7 @@
22#include <linux/posix_acl_xattr.h> 22#include <linux/posix_acl_xattr.h>
23#include <linux/posix_acl.h> 23#include <linux/posix_acl.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/slab.h>
25 26
26#include "ctree.h" 27#include "ctree.h"
27#include "btrfs_inode.h" 28#include "btrfs_inode.h"
@@ -112,12 +113,14 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
112 switch (type) { 113 switch (type) {
113 case ACL_TYPE_ACCESS: 114 case ACL_TYPE_ACCESS:
114 mode = inode->i_mode; 115 mode = inode->i_mode;
115 ret = posix_acl_equiv_mode(acl, &mode);
116 if (ret < 0)
117 return ret;
118 ret = 0;
119 inode->i_mode = mode;
120 name = POSIX_ACL_XATTR_ACCESS; 116 name = POSIX_ACL_XATTR_ACCESS;
117 if (acl) {
118 ret = posix_acl_equiv_mode(acl, &mode);
119 if (ret < 0)
120 return ret;
121 inode->i_mode = mode;
122 }
123 ret = 0;
121 break; 124 break;
122 case ACL_TYPE_DEFAULT: 125 case ACL_TYPE_DEFAULT:
123 if (!S_ISDIR(inode->i_mode)) 126 if (!S_ISDIR(inode->i_mode))
@@ -242,6 +245,7 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
242 ACL_TYPE_ACCESS); 245 ACL_TYPE_ACCESS);
243 } 246 }
244 } 247 }
248 posix_acl_release(clone);
245 } 249 }
246failed: 250failed:
247 posix_acl_release(acl); 251 posix_acl_release(acl);
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c0861e781cdb..462859a30141 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/kthread.h> 19#include <linux/kthread.h>
20#include <linux/slab.h>
20#include <linux/list.h> 21#include <linux/list.h>
21#include <linux/spinlock.h> 22#include <linux/spinlock.h>
22#include <linux/freezer.h> 23#include <linux/freezer.h>
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3f1f50d9d916..7a4dee199832 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -153,6 +153,11 @@ struct btrfs_inode {
153 unsigned ordered_data_close:1; 153 unsigned ordered_data_close:1;
154 unsigned dummy_inode:1; 154 unsigned dummy_inode:1;
155 155
156 /*
157 * always compress this one file
158 */
159 unsigned force_compress:1;
160
156 struct inode vfs_inode; 161 struct inode vfs_inode;
157}; 162};
158 163
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index a11a32058b50..396039b3a8a2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -31,7 +31,7 @@
31#include <linux/swap.h> 31#include <linux/swap.h>
32#include <linux/writeback.h> 32#include <linux/writeback.h>
33#include <linux/bit_spinlock.h> 33#include <linux/bit_spinlock.h>
34#include <linux/pagevec.h> 34#include <linux/slab.h>
35#include "compat.h" 35#include "compat.h"
36#include "ctree.h" 36#include "ctree.h"
37#include "disk-io.h" 37#include "disk-io.h"
@@ -445,7 +445,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
445 unsigned long nr_pages = 0; 445 unsigned long nr_pages = 0;
446 struct extent_map *em; 446 struct extent_map *em;
447 struct address_space *mapping = inode->i_mapping; 447 struct address_space *mapping = inode->i_mapping;
448 struct pagevec pvec;
449 struct extent_map_tree *em_tree; 448 struct extent_map_tree *em_tree;
450 struct extent_io_tree *tree; 449 struct extent_io_tree *tree;
451 u64 end; 450 u64 end;
@@ -461,7 +460,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
461 460
462 end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; 461 end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
463 462
464 pagevec_init(&pvec, 0);
465 while (last_offset < compressed_end) { 463 while (last_offset < compressed_end) {
466 page_index = last_offset >> PAGE_CACHE_SHIFT; 464 page_index = last_offset >> PAGE_CACHE_SHIFT;
467 465
@@ -478,26 +476,17 @@ static noinline int add_ra_bio_pages(struct inode *inode,
478 goto next; 476 goto next;
479 } 477 }
480 478
481 page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS); 479 page = __page_cache_alloc(mapping_gfp_mask(mapping) &
480 ~__GFP_FS);
482 if (!page) 481 if (!page)
483 break; 482 break;
484 483
485 page->index = page_index; 484 if (add_to_page_cache_lru(page, mapping, page_index,
486 /* 485 GFP_NOFS)) {
487 * what we want to do here is call add_to_page_cache_lru,
488 * but that isn't exported, so we reproduce it here
489 */
490 if (add_to_page_cache(page, mapping,
491 page->index, GFP_NOFS)) {
492 page_cache_release(page); 486 page_cache_release(page);
493 goto next; 487 goto next;
494 } 488 }
495 489
496 /* open coding of lru_cache_add, also not exported */
497 page_cache_get(page);
498 if (!pagevec_add(&pvec, page))
499 __pagevec_lru_add_file(&pvec);
500
501 end = last_offset + PAGE_CACHE_SIZE - 1; 490 end = last_offset + PAGE_CACHE_SIZE - 1;
502 /* 491 /*
503 * at this point, we have a locked page in the page cache 492 * at this point, we have a locked page in the page cache
@@ -551,8 +540,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
551next: 540next:
552 last_offset += PAGE_CACHE_SIZE; 541 last_offset += PAGE_CACHE_SIZE;
553 } 542 }
554 if (pagevec_count(&pvec))
555 __pagevec_lru_add_file(&pvec);
556 return 0; 543 return 0;
557} 544}
558 545
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c4bc570a396e..6795a713b205 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "disk-io.h" 22#include "disk-io.h"
22#include "transaction.h" 23#include "transaction.h"
@@ -3040,6 +3041,10 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3040 if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) 3041 if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
3041 goto err; 3042 goto err;
3042 3043
3044 /* the leaf has changed, it now has room. return now */
3045 if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
3046 goto err;
3047
3043 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3048 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3044 fi = btrfs_item_ptr(leaf, path->slots[0], 3049 fi = btrfs_item_ptr(leaf, path->slots[0],
3045 struct btrfs_file_extent_item); 3050 struct btrfs_file_extent_item);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9f806dd04c27..746a7248678e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -26,6 +26,7 @@
26#include <linux/completion.h> 26#include <linux/completion.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/wait.h> 28#include <linux/wait.h>
29#include <linux/slab.h>
29#include <asm/kmap_types.h> 30#include <asm/kmap_types.h>
30#include "extent_io.h" 31#include "extent_io.h"
31#include "extent_map.h" 32#include "extent_map.h"
@@ -373,11 +374,13 @@ struct btrfs_super_block {
373 * ones specified below then we will fail to mount 374 * ones specified below then we will fail to mount
374 */ 375 */
375#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) 376#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
377#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (2ULL << 0)
376 378
377#define BTRFS_FEATURE_COMPAT_SUPP 0ULL 379#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
378#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL 380#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
379#define BTRFS_FEATURE_INCOMPAT_SUPP \ 381#define BTRFS_FEATURE_INCOMPAT_SUPP \
380 BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF 382 (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
383 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)
381 384
382/* 385/*
383 * A leaf is full of items. offset and size tell us where to find 386 * A leaf is full of items. offset and size tell us where to find
@@ -832,7 +835,6 @@ struct btrfs_fs_info {
832 u64 last_trans_log_full_commit; 835 u64 last_trans_log_full_commit;
833 u64 open_ioctl_trans; 836 u64 open_ioctl_trans;
834 unsigned long mount_opt; 837 unsigned long mount_opt;
835 u64 max_extent;
836 u64 max_inline; 838 u64 max_inline;
837 u64 alloc_start; 839 u64 alloc_start;
838 struct btrfs_transaction *running_transaction; 840 struct btrfs_transaction *running_transaction;
@@ -1161,6 +1163,7 @@ struct btrfs_root {
1161#define BTRFS_MOUNT_SSD_SPREAD (1 << 8) 1163#define BTRFS_MOUNT_SSD_SPREAD (1 << 8)
1162#define BTRFS_MOUNT_NOSSD (1 << 9) 1164#define BTRFS_MOUNT_NOSSD (1 << 9)
1163#define BTRFS_MOUNT_DISCARD (1 << 10) 1165#define BTRFS_MOUNT_DISCARD (1 << 10)
1166#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11)
1164 1167
1165#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1168#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1166#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1169#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1181,7 +1184,6 @@ struct btrfs_root {
1181#define BTRFS_INODE_NOATIME (1 << 9) 1184#define BTRFS_INODE_NOATIME (1 << 9)
1182#define BTRFS_INODE_DIRSYNC (1 << 10) 1185#define BTRFS_INODE_DIRSYNC (1 << 10)
1183 1186
1184
1185/* some macros to generate set/get funcs for the struct fields. This 1187/* some macros to generate set/get funcs for the struct fields. This
1186 * assumes there is a lefoo_to_cpu for every type, so lets make a simple 1188 * assumes there is a lefoo_to_cpu for every type, so lets make a simple
1187 * one for u8: 1189 * one for u8:
@@ -1841,7 +1843,7 @@ BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
1841BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, 1843BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
1842 compat_flags, 64); 1844 compat_flags, 64);
1843BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, 1845BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
1844 compat_flags, 64); 1846 compat_ro_flags, 64);
1845BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, 1847BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
1846 incompat_flags, 64); 1848 incompat_flags, 64);
1847BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, 1849BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
@@ -2309,7 +2311,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2309 u32 min_type); 2311 u32 min_type);
2310 2312
2311int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); 2313int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
2312int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end); 2314int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2315 struct extent_state **cached_state);
2313int btrfs_writepages(struct address_space *mapping, 2316int btrfs_writepages(struct address_space *mapping,
2314 struct writeback_control *wbc); 2317 struct writeback_control *wbc);
2315int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 2318int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -2325,7 +2328,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2325int btrfs_readpage(struct file *file, struct page *page); 2328int btrfs_readpage(struct file *file, struct page *page);
2326void btrfs_delete_inode(struct inode *inode); 2329void btrfs_delete_inode(struct inode *inode);
2327void btrfs_put_inode(struct inode *inode); 2330void btrfs_put_inode(struct inode *inode);
2328int btrfs_write_inode(struct inode *inode, int wait); 2331int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
2329void btrfs_dirty_inode(struct inode *inode); 2332void btrfs_dirty_inode(struct inode *inode);
2330struct inode *btrfs_alloc_inode(struct super_block *sb); 2333struct inode *btrfs_alloc_inode(struct super_block *sb);
2331void btrfs_destroy_inode(struct inode *inode); 2334void btrfs_destroy_inode(struct inode *inode);
@@ -2334,7 +2337,7 @@ int btrfs_init_cachep(void);
2334void btrfs_destroy_cachep(void); 2337void btrfs_destroy_cachep(void);
2335long btrfs_ioctl_trans_end(struct file *file); 2338long btrfs_ioctl_trans_end(struct file *file);
2336struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 2339struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
2337 struct btrfs_root *root); 2340 struct btrfs_root *root, int *was_new);
2338int btrfs_commit_write(struct file *file, struct page *page, 2341int btrfs_commit_write(struct file *file, struct page *page,
2339 unsigned from, unsigned to); 2342 unsigned from, unsigned to);
2340struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, 2343struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
@@ -2385,7 +2388,6 @@ void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
2385ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); 2388ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
2386 2389
2387/* super.c */ 2390/* super.c */
2388u64 btrfs_parse_size(char *str);
2389int btrfs_parse_options(struct btrfs_root *root, char *options); 2391int btrfs_parse_options(struct btrfs_root *root, char *options);
2390int btrfs_sync_fs(struct super_block *sb, int wait); 2392int btrfs_sync_fs(struct super_block *sb, int wait);
2391 2393
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 84e6781413b1..902ce507c4e3 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include <linux/sort.h> 21#include <linux/sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "delayed-ref.h" 23#include "delayed-ref.h"
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 009e3bd18f23..feca04197d02 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -27,6 +27,7 @@
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/crc32c.h> 29#include <linux/crc32c.h>
30#include <linux/slab.h>
30#include "compat.h" 31#include "compat.h"
31#include "ctree.h" 32#include "ctree.h"
32#include "disk-io.h" 33#include "disk-io.h"
@@ -43,8 +44,6 @@ static struct extent_io_ops btree_extent_io_ops;
43static void end_workqueue_fn(struct btrfs_work *work); 44static void end_workqueue_fn(struct btrfs_work *work);
44static void free_fs_root(struct btrfs_root *root); 45static void free_fs_root(struct btrfs_root *root);
45 46
46static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
47
48/* 47/*
49 * end_io_wq structs are used to do processing in task context when an IO is 48 * end_io_wq structs are used to do processing in task context when an IO is
50 * complete. This is used during reads to verify checksums, and it is used 49 * complete. This is used during reads to verify checksums, and it is used
@@ -263,13 +262,15 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
263static int verify_parent_transid(struct extent_io_tree *io_tree, 262static int verify_parent_transid(struct extent_io_tree *io_tree,
264 struct extent_buffer *eb, u64 parent_transid) 263 struct extent_buffer *eb, u64 parent_transid)
265{ 264{
265 struct extent_state *cached_state = NULL;
266 int ret; 266 int ret;
267 267
268 if (!parent_transid || btrfs_header_generation(eb) == parent_transid) 268 if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
269 return 0; 269 return 0;
270 270
271 lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS); 271 lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
272 if (extent_buffer_uptodate(io_tree, eb) && 272 0, &cached_state, GFP_NOFS);
273 if (extent_buffer_uptodate(io_tree, eb, cached_state) &&
273 btrfs_header_generation(eb) == parent_transid) { 274 btrfs_header_generation(eb) == parent_transid) {
274 ret = 0; 275 ret = 0;
275 goto out; 276 goto out;
@@ -282,10 +283,10 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
282 (unsigned long long)btrfs_header_generation(eb)); 283 (unsigned long long)btrfs_header_generation(eb));
283 } 284 }
284 ret = 1; 285 ret = 1;
285 clear_extent_buffer_uptodate(io_tree, eb); 286 clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
286out: 287out:
287 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, 288 unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
288 GFP_NOFS); 289 &cached_state, GFP_NOFS);
289 return ret; 290 return ret;
290} 291}
291 292
@@ -901,7 +902,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
901 root->highest_objectid = 0; 902 root->highest_objectid = 0;
902 root->name = NULL; 903 root->name = NULL;
903 root->in_sysfs = 0; 904 root->in_sysfs = 0;
904 root->inode_tree.rb_node = NULL; 905 root->inode_tree = RB_ROOT;
905 906
906 INIT_LIST_HEAD(&root->dirty_list); 907 INIT_LIST_HEAD(&root->dirty_list);
907 INIT_LIST_HEAD(&root->orphan_list); 908 INIT_LIST_HEAD(&root->orphan_list);
@@ -1372,19 +1373,11 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1372{ 1373{
1373 int err; 1374 int err;
1374 1375
1375 bdi->name = "btrfs";
1376 bdi->capabilities = BDI_CAP_MAP_COPY; 1376 bdi->capabilities = BDI_CAP_MAP_COPY;
1377 err = bdi_init(bdi); 1377 err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
1378 if (err) 1378 if (err)
1379 return err; 1379 return err;
1380 1380
1381 err = bdi_register(bdi, NULL, "btrfs-%d",
1382 atomic_inc_return(&btrfs_bdi_num));
1383 if (err) {
1384 bdi_destroy(bdi);
1385 return err;
1386 }
1387
1388 bdi->ra_pages = default_backing_dev_info.ra_pages; 1381 bdi->ra_pages = default_backing_dev_info.ra_pages;
1389 bdi->unplug_io_fn = btrfs_unplug_io_fn; 1382 bdi->unplug_io_fn = btrfs_unplug_io_fn;
1390 bdi->unplug_io_data = info; 1383 bdi->unplug_io_data = info;
@@ -1632,7 +1625,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1632 atomic_set(&fs_info->async_submit_draining, 0); 1625 atomic_set(&fs_info->async_submit_draining, 0);
1633 atomic_set(&fs_info->nr_async_bios, 0); 1626 atomic_set(&fs_info->nr_async_bios, 0);
1634 fs_info->sb = sb; 1627 fs_info->sb = sb;
1635 fs_info->max_extent = (u64)-1;
1636 fs_info->max_inline = 8192 * 1024; 1628 fs_info->max_inline = 8192 * 1024;
1637 fs_info->metadata_ratio = 0; 1629 fs_info->metadata_ratio = 0;
1638 1630
@@ -1673,7 +1665,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1673 insert_inode_hash(fs_info->btree_inode); 1665 insert_inode_hash(fs_info->btree_inode);
1674 1666
1675 spin_lock_init(&fs_info->block_group_cache_lock); 1667 spin_lock_init(&fs_info->block_group_cache_lock);
1676 fs_info->block_group_cache_tree.rb_node = NULL; 1668 fs_info->block_group_cache_tree = RB_ROOT;
1677 1669
1678 extent_io_tree_init(&fs_info->freed_extents[0], 1670 extent_io_tree_init(&fs_info->freed_extents[0],
1679 fs_info->btree_inode->i_mapping, GFP_NOFS); 1671 fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -1920,7 +1912,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1920 1912
1921 csum_root->track_dirty = 1; 1913 csum_root->track_dirty = 1;
1922 1914
1923 btrfs_read_block_groups(extent_root); 1915 ret = btrfs_read_block_groups(extent_root);
1916 if (ret) {
1917 printk(KERN_ERR "Failed to read block groups: %d\n", ret);
1918 goto fail_block_groups;
1919 }
1924 1920
1925 fs_info->generation = generation; 1921 fs_info->generation = generation;
1926 fs_info->last_trans_committed = generation; 1922 fs_info->last_trans_committed = generation;
@@ -1930,7 +1926,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1930 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 1926 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1931 "btrfs-cleaner"); 1927 "btrfs-cleaner");
1932 if (IS_ERR(fs_info->cleaner_kthread)) 1928 if (IS_ERR(fs_info->cleaner_kthread))
1933 goto fail_csum_root; 1929 goto fail_block_groups;
1934 1930
1935 fs_info->transaction_kthread = kthread_run(transaction_kthread, 1931 fs_info->transaction_kthread = kthread_run(transaction_kthread,
1936 tree_root, 1932 tree_root,
@@ -1982,7 +1978,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1982 1978
1983 if (!(sb->s_flags & MS_RDONLY)) { 1979 if (!(sb->s_flags & MS_RDONLY)) {
1984 ret = btrfs_recover_relocation(tree_root); 1980 ret = btrfs_recover_relocation(tree_root);
1985 BUG_ON(ret); 1981 if (ret < 0) {
1982 printk(KERN_WARNING
1983 "btrfs: failed to recover relocation\n");
1984 err = -EINVAL;
1985 goto fail_trans_kthread;
1986 }
1986 } 1987 }
1987 1988
1988 location.objectid = BTRFS_FS_TREE_OBJECTID; 1989 location.objectid = BTRFS_FS_TREE_OBJECTID;
@@ -1993,6 +1994,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1993 if (!fs_info->fs_root) 1994 if (!fs_info->fs_root)
1994 goto fail_trans_kthread; 1995 goto fail_trans_kthread;
1995 1996
1997 if (!(sb->s_flags & MS_RDONLY)) {
1998 down_read(&fs_info->cleanup_work_sem);
1999 btrfs_orphan_cleanup(fs_info->fs_root);
2000 up_read(&fs_info->cleanup_work_sem);
2001 }
2002
1996 return tree_root; 2003 return tree_root;
1997 2004
1998fail_trans_kthread: 2005fail_trans_kthread:
@@ -2007,7 +2014,8 @@ fail_cleaner:
2007 filemap_write_and_wait(fs_info->btree_inode->i_mapping); 2014 filemap_write_and_wait(fs_info->btree_inode->i_mapping);
2008 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2015 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2009 2016
2010fail_csum_root: 2017fail_block_groups:
2018 btrfs_free_block_groups(fs_info);
2011 free_extent_buffer(csum_root->node); 2019 free_extent_buffer(csum_root->node);
2012 free_extent_buffer(csum_root->commit_root); 2020 free_extent_buffer(csum_root->commit_root);
2013fail_dev_root: 2021fail_dev_root:
@@ -2486,7 +2494,8 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
2486 int ret; 2494 int ret;
2487 struct inode *btree_inode = buf->first_page->mapping->host; 2495 struct inode *btree_inode = buf->first_page->mapping->host;
2488 2496
2489 ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf); 2497 ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf,
2498 NULL);
2490 if (!ret) 2499 if (!ret)
2491 return ret; 2500 return ret;
2492 2501
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index ba5c3fd5ab8c..951ef09b82f4 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -95,7 +95,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
95 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 95 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
96 key.offset = 0; 96 key.offset = 0;
97 97
98 inode = btrfs_iget(sb, &key, root); 98 inode = btrfs_iget(sb, &key, root, NULL);
99 if (IS_ERR(inode)) { 99 if (IS_ERR(inode)) {
100 err = PTR_ERR(inode); 100 err = PTR_ERR(inode);
101 goto fail; 101 goto fail;
@@ -223,7 +223,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
223 223
224 key.type = BTRFS_INODE_ITEM_KEY; 224 key.type = BTRFS_INODE_ITEM_KEY;
225 key.offset = 0; 225 key.offset = 0;
226 dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root)); 226 dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
227 if (!IS_ERR(dentry)) 227 if (!IS_ERR(dentry))
228 dentry->d_op = &btrfs_dentry_operations; 228 dentry->d_op = &btrfs_dentry_operations;
229 return dentry; 229 return dentry;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 56e50137d0e6..b34d32fdaaec 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -22,6 +22,7 @@
22#include <linux/sort.h> 22#include <linux/sort.h>
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/slab.h>
25#include "compat.h" 26#include "compat.h"
26#include "hash.h" 27#include "hash.h"
27#include "ctree.h" 28#include "ctree.h"
@@ -83,6 +84,17 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
83 return (cache->flags & bits) == bits; 84 return (cache->flags & bits) == bits;
84} 85}
85 86
87void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
88{
89 atomic_inc(&cache->count);
90}
91
92void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
93{
94 if (atomic_dec_and_test(&cache->count))
95 kfree(cache);
96}
97
86/* 98/*
87 * this adds the block group to the fs_info rb tree for the block group 99 * this adds the block group to the fs_info rb tree for the block group
88 * cache 100 * cache
@@ -156,7 +168,7 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
156 } 168 }
157 } 169 }
158 if (ret) 170 if (ret)
159 atomic_inc(&ret->count); 171 btrfs_get_block_group(ret);
160 spin_unlock(&info->block_group_cache_lock); 172 spin_unlock(&info->block_group_cache_lock);
161 173
162 return ret; 174 return ret;
@@ -407,6 +419,8 @@ err:
407 419
408 put_caching_control(caching_ctl); 420 put_caching_control(caching_ctl);
409 atomic_dec(&block_group->space_info->caching_threads); 421 atomic_dec(&block_group->space_info->caching_threads);
422 btrfs_put_block_group(block_group);
423
410 return 0; 424 return 0;
411} 425}
412 426
@@ -447,6 +461,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache)
447 up_write(&fs_info->extent_commit_sem); 461 up_write(&fs_info->extent_commit_sem);
448 462
449 atomic_inc(&cache->space_info->caching_threads); 463 atomic_inc(&cache->space_info->caching_threads);
464 btrfs_get_block_group(cache);
450 465
451 tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", 466 tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
452 cache->key.objectid); 467 cache->key.objectid);
@@ -486,12 +501,6 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
486 return cache; 501 return cache;
487} 502}
488 503
489void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
490{
491 if (atomic_dec_and_test(&cache->count))
492 kfree(cache);
493}
494
495static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, 504static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
496 u64 flags) 505 u64 flags)
497{ 506{
@@ -2582,7 +2591,7 @@ next_block_group(struct btrfs_root *root,
2582 if (node) { 2591 if (node) {
2583 cache = rb_entry(node, struct btrfs_block_group_cache, 2592 cache = rb_entry(node, struct btrfs_block_group_cache,
2584 cache_node); 2593 cache_node);
2585 atomic_inc(&cache->count); 2594 btrfs_get_block_group(cache);
2586 } else 2595 } else
2587 cache = NULL; 2596 cache = NULL;
2588 spin_unlock(&root->fs_info->block_group_cache_lock); 2597 spin_unlock(&root->fs_info->block_group_cache_lock);
@@ -2668,6 +2677,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2668 2677
2669 INIT_LIST_HEAD(&found->block_groups); 2678 INIT_LIST_HEAD(&found->block_groups);
2670 init_rwsem(&found->groups_sem); 2679 init_rwsem(&found->groups_sem);
2680 init_waitqueue_head(&found->flush_wait);
2681 init_waitqueue_head(&found->allocate_wait);
2671 spin_lock_init(&found->lock); 2682 spin_lock_init(&found->lock);
2672 found->flags = flags; 2683 found->flags = flags;
2673 found->total_bytes = total_bytes; 2684 found->total_bytes = total_bytes;
@@ -2838,7 +2849,7 @@ int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2838 } 2849 }
2839 spin_unlock(&BTRFS_I(inode)->accounting_lock); 2850 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2840 2851
2841 BTRFS_I(inode)->reserved_extents--; 2852 BTRFS_I(inode)->reserved_extents -= num_items;
2842 BUG_ON(BTRFS_I(inode)->reserved_extents < 0); 2853 BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
2843 2854
2844 if (meta_sinfo->bytes_delalloc < num_bytes) { 2855 if (meta_sinfo->bytes_delalloc < num_bytes) {
@@ -2936,12 +2947,10 @@ static void flush_delalloc(struct btrfs_root *root,
2936 2947
2937 spin_lock(&info->lock); 2948 spin_lock(&info->lock);
2938 2949
2939 if (!info->flushing) { 2950 if (!info->flushing)
2940 info->flushing = 1; 2951 info->flushing = 1;
2941 init_waitqueue_head(&info->flush_wait); 2952 else
2942 } else {
2943 wait = true; 2953 wait = true;
2944 }
2945 2954
2946 spin_unlock(&info->lock); 2955 spin_unlock(&info->lock);
2947 2956
@@ -3003,7 +3012,6 @@ static int maybe_allocate_chunk(struct btrfs_root *root,
3003 if (!info->allocating_chunk) { 3012 if (!info->allocating_chunk) {
3004 info->force_alloc = 1; 3013 info->force_alloc = 1;
3005 info->allocating_chunk = 1; 3014 info->allocating_chunk = 1;
3006 init_waitqueue_head(&info->allocate_wait);
3007 } else { 3015 } else {
3008 wait = true; 3016 wait = true;
3009 } 3017 }
@@ -3103,7 +3111,7 @@ again:
3103 return -ENOSPC; 3111 return -ENOSPC;
3104 } 3112 }
3105 3113
3106 BTRFS_I(inode)->reserved_extents++; 3114 BTRFS_I(inode)->reserved_extents += num_items;
3107 check_force_delalloc(meta_sinfo); 3115 check_force_delalloc(meta_sinfo);
3108 spin_unlock(&meta_sinfo->lock); 3116 spin_unlock(&meta_sinfo->lock);
3109 3117
@@ -3227,7 +3235,8 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
3227 u64 bytes) 3235 u64 bytes)
3228{ 3236{
3229 struct btrfs_space_info *data_sinfo; 3237 struct btrfs_space_info *data_sinfo;
3230 int ret = 0, committed = 0; 3238 u64 used;
3239 int ret = 0, committed = 0, flushed = 0;
3231 3240
3232 /* make sure bytes are sectorsize aligned */ 3241 /* make sure bytes are sectorsize aligned */
3233 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3242 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
@@ -3239,12 +3248,21 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
3239again: 3248again:
3240 /* make sure we have enough space to handle the data first */ 3249 /* make sure we have enough space to handle the data first */
3241 spin_lock(&data_sinfo->lock); 3250 spin_lock(&data_sinfo->lock);
3242 if (data_sinfo->total_bytes - data_sinfo->bytes_used - 3251 used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc +
3243 data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - 3252 data_sinfo->bytes_reserved + data_sinfo->bytes_pinned +
3244 data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - 3253 data_sinfo->bytes_readonly + data_sinfo->bytes_may_use +
3245 data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) { 3254 data_sinfo->bytes_super;
3255
3256 if (used + bytes > data_sinfo->total_bytes) {
3246 struct btrfs_trans_handle *trans; 3257 struct btrfs_trans_handle *trans;
3247 3258
3259 if (!flushed) {
3260 spin_unlock(&data_sinfo->lock);
3261 flush_delalloc(root, data_sinfo);
3262 flushed = 1;
3263 goto again;
3264 }
3265
3248 /* 3266 /*
3249 * if we don't have enough free bytes in this space then we need 3267 * if we don't have enough free bytes in this space then we need
3250 * to alloc a new chunk. 3268 * to alloc a new chunk.
@@ -4162,6 +4180,10 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4162 ins->offset = 0; 4180 ins->offset = 0;
4163 4181
4164 space_info = __find_space_info(root->fs_info, data); 4182 space_info = __find_space_info(root->fs_info, data);
4183 if (!space_info) {
4184 printk(KERN_ERR "No space info for %d\n", data);
4185 return -ENOSPC;
4186 }
4165 4187
4166 if (orig_root->ref_cows || empty_size) 4188 if (orig_root->ref_cows || empty_size)
4167 allowed_chunk_alloc = 1; 4189 allowed_chunk_alloc = 1;
@@ -4227,7 +4249,7 @@ search:
4227 u64 offset; 4249 u64 offset;
4228 int cached; 4250 int cached;
4229 4251
4230 atomic_inc(&block_group->count); 4252 btrfs_get_block_group(block_group);
4231 search_start = block_group->key.objectid; 4253 search_start = block_group->key.objectid;
4232 4254
4233have_block_group: 4255have_block_group:
@@ -4315,7 +4337,7 @@ have_block_group:
4315 4337
4316 btrfs_put_block_group(block_group); 4338 btrfs_put_block_group(block_group);
4317 block_group = last_ptr->block_group; 4339 block_group = last_ptr->block_group;
4318 atomic_inc(&block_group->count); 4340 btrfs_get_block_group(block_group);
4319 spin_unlock(&last_ptr->lock); 4341 spin_unlock(&last_ptr->lock);
4320 spin_unlock(&last_ptr->refill_lock); 4342 spin_unlock(&last_ptr->refill_lock);
4321 4343
@@ -5197,6 +5219,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5197 next = btrfs_find_tree_block(root, bytenr, blocksize); 5219 next = btrfs_find_tree_block(root, bytenr, blocksize);
5198 if (!next) { 5220 if (!next) {
5199 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 5221 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
5222 if (!next)
5223 return -ENOMEM;
5200 reada = 1; 5224 reada = 1;
5201 } 5225 }
5202 btrfs_tree_lock(next); 5226 btrfs_tree_lock(next);
@@ -5394,10 +5418,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5394 int ret; 5418 int ret;
5395 5419
5396 while (level >= 0) { 5420 while (level >= 0) {
5397 if (path->slots[level] >=
5398 btrfs_header_nritems(path->nodes[level]))
5399 break;
5400
5401 ret = walk_down_proc(trans, root, path, wc, lookup_info); 5421 ret = walk_down_proc(trans, root, path, wc, lookup_info);
5402 if (ret > 0) 5422 if (ret > 0)
5403 break; 5423 break;
@@ -5405,11 +5425,16 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5405 if (level == 0) 5425 if (level == 0)
5406 break; 5426 break;
5407 5427
5428 if (path->slots[level] >=
5429 btrfs_header_nritems(path->nodes[level]))
5430 break;
5431
5408 ret = do_walk_down(trans, root, path, wc, &lookup_info); 5432 ret = do_walk_down(trans, root, path, wc, &lookup_info);
5409 if (ret > 0) { 5433 if (ret > 0) {
5410 path->slots[level]++; 5434 path->slots[level]++;
5411 continue; 5435 continue;
5412 } 5436 } else if (ret < 0)
5437 return ret;
5413 level = wc->level; 5438 level = wc->level;
5414 } 5439 }
5415 return 0; 5440 return 0;
@@ -6553,6 +6578,7 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root,
6553 struct btrfs_key key; 6578 struct btrfs_key key;
6554 struct inode *inode = NULL; 6579 struct inode *inode = NULL;
6555 struct btrfs_file_extent_item *fi; 6580 struct btrfs_file_extent_item *fi;
6581 struct extent_state *cached_state = NULL;
6556 u64 num_bytes; 6582 u64 num_bytes;
6557 u64 skip_objectid = 0; 6583 u64 skip_objectid = 0;
6558 u32 nritems; 6584 u32 nritems;
@@ -6581,12 +6607,14 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root,
6581 } 6607 }
6582 num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 6608 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6583 6609
6584 lock_extent(&BTRFS_I(inode)->io_tree, key.offset, 6610 lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset,
6585 key.offset + num_bytes - 1, GFP_NOFS); 6611 key.offset + num_bytes - 1, 0, &cached_state,
6612 GFP_NOFS);
6586 btrfs_drop_extent_cache(inode, key.offset, 6613 btrfs_drop_extent_cache(inode, key.offset,
6587 key.offset + num_bytes - 1, 1); 6614 key.offset + num_bytes - 1, 1);
6588 unlock_extent(&BTRFS_I(inode)->io_tree, key.offset, 6615 unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset,
6589 key.offset + num_bytes - 1, GFP_NOFS); 6616 key.offset + num_bytes - 1, &cached_state,
6617 GFP_NOFS);
6590 cond_resched(); 6618 cond_resched();
6591 } 6619 }
6592 iput(inode); 6620 iput(inode);
@@ -7358,7 +7386,6 @@ static int find_first_block_group(struct btrfs_root *root,
7358 } 7386 }
7359 path->slots[0]++; 7387 path->slots[0]++;
7360 } 7388 }
7361 ret = -ENOENT;
7362out: 7389out:
7363 return ret; 7390 return ret;
7364} 7391}
@@ -7395,9 +7422,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7395 wait_block_group_cache_done(block_group); 7422 wait_block_group_cache_done(block_group);
7396 7423
7397 btrfs_remove_free_space_cache(block_group); 7424 btrfs_remove_free_space_cache(block_group);
7398 7425 btrfs_put_block_group(block_group);
7399 WARN_ON(atomic_read(&block_group->count) != 1);
7400 kfree(block_group);
7401 7426
7402 spin_lock(&info->block_group_cache_lock); 7427 spin_lock(&info->block_group_cache_lock);
7403 } 7428 }
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 96577e8bf9fd..d2d03684fab2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2,7 +2,6 @@
2#include <linux/slab.h> 2#include <linux/slab.h>
3#include <linux/bio.h> 3#include <linux/bio.h>
4#include <linux/mm.h> 4#include <linux/mm.h>
5#include <linux/gfp.h>
6#include <linux/pagemap.h> 5#include <linux/pagemap.h>
7#include <linux/page-flags.h> 6#include <linux/page-flags.h>
8#include <linux/module.h> 7#include <linux/module.h>
@@ -104,8 +103,8 @@ void extent_io_exit(void)
104void extent_io_tree_init(struct extent_io_tree *tree, 103void extent_io_tree_init(struct extent_io_tree *tree,
105 struct address_space *mapping, gfp_t mask) 104 struct address_space *mapping, gfp_t mask)
106{ 105{
107 tree->state.rb_node = NULL; 106 tree->state = RB_ROOT;
108 tree->buffer.rb_node = NULL; 107 tree->buffer = RB_ROOT;
109 tree->ops = NULL; 108 tree->ops = NULL;
110 tree->dirty_bytes = 0; 109 tree->dirty_bytes = 0;
111 spin_lock_init(&tree->lock); 110 spin_lock_init(&tree->lock);
@@ -513,7 +512,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
513 u64 last_end; 512 u64 last_end;
514 int err; 513 int err;
515 int set = 0; 514 int set = 0;
515 int clear = 0;
516 516
517 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
518 clear = 1;
517again: 519again:
518 if (!prealloc && (mask & __GFP_WAIT)) { 520 if (!prealloc && (mask & __GFP_WAIT)) {
519 prealloc = alloc_extent_state(mask); 521 prealloc = alloc_extent_state(mask);
@@ -524,14 +526,20 @@ again:
524 spin_lock(&tree->lock); 526 spin_lock(&tree->lock);
525 if (cached_state) { 527 if (cached_state) {
526 cached = *cached_state; 528 cached = *cached_state;
527 *cached_state = NULL; 529
528 cached_state = NULL; 530 if (clear) {
531 *cached_state = NULL;
532 cached_state = NULL;
533 }
534
529 if (cached && cached->tree && cached->start == start) { 535 if (cached && cached->tree && cached->start == start) {
530 atomic_dec(&cached->refs); 536 if (clear)
537 atomic_dec(&cached->refs);
531 state = cached; 538 state = cached;
532 goto hit_next; 539 goto hit_next;
533 } 540 }
534 free_extent_state(cached); 541 if (clear)
542 free_extent_state(cached);
535 } 543 }
536 /* 544 /*
537 * this search will find the extents that end after 545 * this search will find the extents that end after
@@ -946,11 +954,11 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
946} 954}
947 955
948int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 956int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
949 gfp_t mask) 957 struct extent_state **cached_state, gfp_t mask)
950{ 958{
951 return set_extent_bit(tree, start, end, 959 return set_extent_bit(tree, start, end,
952 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 960 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
953 0, NULL, NULL, mask); 961 0, NULL, cached_state, mask);
954} 962}
955 963
956int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 964int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -984,10 +992,11 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
984} 992}
985 993
986static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, 994static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
987 u64 end, gfp_t mask) 995 u64 end, struct extent_state **cached_state,
996 gfp_t mask)
988{ 997{
989 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, 998 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
990 NULL, mask); 999 cached_state, mask);
991} 1000}
992 1001
993int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) 1002int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
@@ -1171,7 +1180,8 @@ out:
1171 * 1 is returned if we find something, 0 if nothing was in the tree 1180 * 1 is returned if we find something, 0 if nothing was in the tree
1172 */ 1181 */
1173static noinline u64 find_delalloc_range(struct extent_io_tree *tree, 1182static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1174 u64 *start, u64 *end, u64 max_bytes) 1183 u64 *start, u64 *end, u64 max_bytes,
1184 struct extent_state **cached_state)
1175{ 1185{
1176 struct rb_node *node; 1186 struct rb_node *node;
1177 struct extent_state *state; 1187 struct extent_state *state;
@@ -1203,8 +1213,11 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1203 *end = state->end; 1213 *end = state->end;
1204 goto out; 1214 goto out;
1205 } 1215 }
1206 if (!found) 1216 if (!found) {
1207 *start = state->start; 1217 *start = state->start;
1218 *cached_state = state;
1219 atomic_inc(&state->refs);
1220 }
1208 found++; 1221 found++;
1209 *end = state->end; 1222 *end = state->end;
1210 cur_start = state->end + 1; 1223 cur_start = state->end + 1;
@@ -1336,10 +1349,11 @@ again:
1336 delalloc_start = *start; 1349 delalloc_start = *start;
1337 delalloc_end = 0; 1350 delalloc_end = 0;
1338 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, 1351 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1339 max_bytes); 1352 max_bytes, &cached_state);
1340 if (!found || delalloc_end <= *start) { 1353 if (!found || delalloc_end <= *start) {
1341 *start = delalloc_start; 1354 *start = delalloc_start;
1342 *end = delalloc_end; 1355 *end = delalloc_end;
1356 free_extent_state(cached_state);
1343 return found; 1357 return found;
1344 } 1358 }
1345 1359
@@ -1722,7 +1736,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
1722 } 1736 }
1723 1737
1724 if (!uptodate) { 1738 if (!uptodate) {
1725 clear_extent_uptodate(tree, start, end, GFP_NOFS); 1739 clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
1726 ClearPageUptodate(page); 1740 ClearPageUptodate(page);
1727 SetPageError(page); 1741 SetPageError(page);
1728 } 1742 }
@@ -1750,7 +1764,8 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
1750static void end_bio_extent_readpage(struct bio *bio, int err) 1764static void end_bio_extent_readpage(struct bio *bio, int err)
1751{ 1765{
1752 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1766 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1753 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1767 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
1768 struct bio_vec *bvec = bio->bi_io_vec;
1754 struct extent_io_tree *tree; 1769 struct extent_io_tree *tree;
1755 u64 start; 1770 u64 start;
1756 u64 end; 1771 u64 end;
@@ -1773,7 +1788,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1773 else 1788 else
1774 whole_page = 0; 1789 whole_page = 0;
1775 1790
1776 if (--bvec >= bio->bi_io_vec) 1791 if (++bvec <= bvec_end)
1777 prefetchw(&bvec->bv_page->flags); 1792 prefetchw(&bvec->bv_page->flags);
1778 1793
1779 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 1794 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
@@ -1818,7 +1833,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1818 } 1833 }
1819 check_page_locked(tree, page); 1834 check_page_locked(tree, page);
1820 } 1835 }
1821 } while (bvec >= bio->bi_io_vec); 1836 } while (bvec <= bvec_end);
1822 1837
1823 bio_put(bio); 1838 bio_put(bio);
1824} 1839}
@@ -2663,33 +2678,20 @@ int extent_readpages(struct extent_io_tree *tree,
2663{ 2678{
2664 struct bio *bio = NULL; 2679 struct bio *bio = NULL;
2665 unsigned page_idx; 2680 unsigned page_idx;
2666 struct pagevec pvec;
2667 unsigned long bio_flags = 0; 2681 unsigned long bio_flags = 0;
2668 2682
2669 pagevec_init(&pvec, 0);
2670 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 2683 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2671 struct page *page = list_entry(pages->prev, struct page, lru); 2684 struct page *page = list_entry(pages->prev, struct page, lru);
2672 2685
2673 prefetchw(&page->flags); 2686 prefetchw(&page->flags);
2674 list_del(&page->lru); 2687 list_del(&page->lru);
2675 /* 2688 if (!add_to_page_cache_lru(page, mapping,
2676 * what we want to do here is call add_to_page_cache_lru,
2677 * but that isn't exported, so we reproduce it here
2678 */
2679 if (!add_to_page_cache(page, mapping,
2680 page->index, GFP_KERNEL)) { 2689 page->index, GFP_KERNEL)) {
2681
2682 /* open coding of lru_cache_add, also not exported */
2683 page_cache_get(page);
2684 if (!pagevec_add(&pvec, page))
2685 __pagevec_lru_add_file(&pvec);
2686 __extent_read_full_page(tree, page, get_extent, 2690 __extent_read_full_page(tree, page, get_extent,
2687 &bio, 0, &bio_flags); 2691 &bio, 0, &bio_flags);
2688 } 2692 }
2689 page_cache_release(page); 2693 page_cache_release(page);
2690 } 2694 }
2691 if (pagevec_count(&pvec))
2692 __pagevec_lru_add_file(&pvec);
2693 BUG_ON(!list_empty(pages)); 2695 BUG_ON(!list_empty(pages));
2694 if (bio) 2696 if (bio)
2695 submit_one_bio(READ, bio, 0, bio_flags); 2697 submit_one_bio(READ, bio, 0, bio_flags);
@@ -2704,6 +2706,7 @@ int extent_readpages(struct extent_io_tree *tree,
2704int extent_invalidatepage(struct extent_io_tree *tree, 2706int extent_invalidatepage(struct extent_io_tree *tree,
2705 struct page *page, unsigned long offset) 2707 struct page *page, unsigned long offset)
2706{ 2708{
2709 struct extent_state *cached_state = NULL;
2707 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); 2710 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2708 u64 end = start + PAGE_CACHE_SIZE - 1; 2711 u64 end = start + PAGE_CACHE_SIZE - 1;
2709 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 2712 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
@@ -2712,12 +2715,12 @@ int extent_invalidatepage(struct extent_io_tree *tree,
2712 if (start > end) 2715 if (start > end)
2713 return 0; 2716 return 0;
2714 2717
2715 lock_extent(tree, start, end, GFP_NOFS); 2718 lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
2716 wait_on_page_writeback(page); 2719 wait_on_page_writeback(page);
2717 clear_extent_bit(tree, start, end, 2720 clear_extent_bit(tree, start, end,
2718 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 2721 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
2719 EXTENT_DO_ACCOUNTING, 2722 EXTENT_DO_ACCOUNTING,
2720 1, 1, NULL, GFP_NOFS); 2723 1, 1, &cached_state, GFP_NOFS);
2721 return 0; 2724 return 0;
2722} 2725}
2723 2726
@@ -2920,16 +2923,17 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2920 get_extent_t *get_extent) 2923 get_extent_t *get_extent)
2921{ 2924{
2922 struct inode *inode = mapping->host; 2925 struct inode *inode = mapping->host;
2926 struct extent_state *cached_state = NULL;
2923 u64 start = iblock << inode->i_blkbits; 2927 u64 start = iblock << inode->i_blkbits;
2924 sector_t sector = 0; 2928 sector_t sector = 0;
2925 size_t blksize = (1 << inode->i_blkbits); 2929 size_t blksize = (1 << inode->i_blkbits);
2926 struct extent_map *em; 2930 struct extent_map *em;
2927 2931
2928 lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, 2932 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2929 GFP_NOFS); 2933 0, &cached_state, GFP_NOFS);
2930 em = get_extent(inode, NULL, 0, start, blksize, 0); 2934 em = get_extent(inode, NULL, 0, start, blksize, 0);
2931 unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, 2935 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start,
2932 GFP_NOFS); 2936 start + blksize - 1, &cached_state, GFP_NOFS);
2933 if (!em || IS_ERR(em)) 2937 if (!em || IS_ERR(em))
2934 return 0; 2938 return 0;
2935 2939
@@ -2951,6 +2955,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2951 u32 flags = 0; 2955 u32 flags = 0;
2952 u64 disko = 0; 2956 u64 disko = 0;
2953 struct extent_map *em = NULL; 2957 struct extent_map *em = NULL;
2958 struct extent_state *cached_state = NULL;
2954 int end = 0; 2959 int end = 0;
2955 u64 em_start = 0, em_len = 0; 2960 u64 em_start = 0, em_len = 0;
2956 unsigned long emflags; 2961 unsigned long emflags;
@@ -2959,8 +2964,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2959 if (len == 0) 2964 if (len == 0)
2960 return -EINVAL; 2965 return -EINVAL;
2961 2966
2962 lock_extent(&BTRFS_I(inode)->io_tree, start, start + len, 2967 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2963 GFP_NOFS); 2968 &cached_state, GFP_NOFS);
2964 em = get_extent(inode, NULL, 0, off, max - off, 0); 2969 em = get_extent(inode, NULL, 0, off, max - off, 0);
2965 if (!em) 2970 if (!em)
2966 goto out; 2971 goto out;
@@ -3023,8 +3028,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3023out_free: 3028out_free:
3024 free_extent_map(em); 3029 free_extent_map(em);
3025out: 3030out:
3026 unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len, 3031 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
3027 GFP_NOFS); 3032 &cached_state, GFP_NOFS);
3028 return ret; 3033 return ret;
3029} 3034}
3030 3035
@@ -3165,10 +3170,9 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3165 spin_unlock(&tree->buffer_lock); 3170 spin_unlock(&tree->buffer_lock);
3166 goto free_eb; 3171 goto free_eb;
3167 } 3172 }
3168 spin_unlock(&tree->buffer_lock);
3169
3170 /* add one reference for the tree */ 3173 /* add one reference for the tree */
3171 atomic_inc(&eb->refs); 3174 atomic_inc(&eb->refs);
3175 spin_unlock(&tree->buffer_lock);
3172 return eb; 3176 return eb;
3173 3177
3174free_eb: 3178free_eb:
@@ -3265,7 +3269,8 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
3265} 3269}
3266 3270
3267int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 3271int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3268 struct extent_buffer *eb) 3272 struct extent_buffer *eb,
3273 struct extent_state **cached_state)
3269{ 3274{
3270 unsigned long i; 3275 unsigned long i;
3271 struct page *page; 3276 struct page *page;
@@ -3275,7 +3280,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3275 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3280 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3276 3281
3277 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3282 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3278 GFP_NOFS); 3283 cached_state, GFP_NOFS);
3279 for (i = 0; i < num_pages; i++) { 3284 for (i = 0; i < num_pages; i++) {
3280 page = extent_buffer_page(eb, i); 3285 page = extent_buffer_page(eb, i);
3281 if (page) 3286 if (page)
@@ -3335,7 +3340,8 @@ int extent_range_uptodate(struct extent_io_tree *tree,
3335} 3340}
3336 3341
3337int extent_buffer_uptodate(struct extent_io_tree *tree, 3342int extent_buffer_uptodate(struct extent_io_tree *tree,
3338 struct extent_buffer *eb) 3343 struct extent_buffer *eb,
3344 struct extent_state *cached_state)
3339{ 3345{
3340 int ret = 0; 3346 int ret = 0;
3341 unsigned long num_pages; 3347 unsigned long num_pages;
@@ -3347,7 +3353,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3347 return 1; 3353 return 1;
3348 3354
3349 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3355 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3350 EXTENT_UPTODATE, 1, NULL); 3356 EXTENT_UPTODATE, 1, cached_state);
3351 if (ret) 3357 if (ret)
3352 return ret; 3358 return ret;
3353 3359
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 36de250a7b2b..bbab4813646f 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -163,6 +163,8 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
163int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 163int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
164 int bits, struct extent_state **cached, gfp_t mask); 164 int bits, struct extent_state **cached, gfp_t mask);
165int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); 165int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
166int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
167 struct extent_state **cached, gfp_t mask);
166int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, 168int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
167 gfp_t mask); 169 gfp_t mask);
168int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 170int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
@@ -196,7 +198,7 @@ int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
196int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start, 198int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
197 u64 end, gfp_t mask); 199 u64 end, gfp_t mask);
198int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 200int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
199 gfp_t mask); 201 struct extent_state **cached_state, gfp_t mask);
200int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, 202int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
201 gfp_t mask); 203 gfp_t mask);
202int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 204int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -281,9 +283,11 @@ int test_extent_buffer_dirty(struct extent_io_tree *tree,
281int set_extent_buffer_uptodate(struct extent_io_tree *tree, 283int set_extent_buffer_uptodate(struct extent_io_tree *tree,
282 struct extent_buffer *eb); 284 struct extent_buffer *eb);
283int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 285int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
284 struct extent_buffer *eb); 286 struct extent_buffer *eb,
287 struct extent_state **cached_state);
285int extent_buffer_uptodate(struct extent_io_tree *tree, 288int extent_buffer_uptodate(struct extent_io_tree *tree,
286 struct extent_buffer *eb); 289 struct extent_buffer *eb,
290 struct extent_state *cached_state);
287int map_extent_buffer(struct extent_buffer *eb, unsigned long offset, 291int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
288 unsigned long min_len, char **token, char **map, 292 unsigned long min_len, char **token, char **map,
289 unsigned long *map_start, 293 unsigned long *map_start,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 46bea0f4dc7b..454ca52d6451 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,5 +1,4 @@
1#include <linux/err.h> 1#include <linux/err.h>
2#include <linux/gfp.h>
3#include <linux/slab.h> 2#include <linux/slab.h>
4#include <linux/module.h> 3#include <linux/module.h>
5#include <linux/spinlock.h> 4#include <linux/spinlock.h>
@@ -35,7 +34,7 @@ void extent_map_exit(void)
35 */ 34 */
36void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) 35void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
37{ 36{
38 tree->map.rb_node = NULL; 37 tree->map = RB_ROOT;
39 rwlock_init(&tree->lock); 38 rwlock_init(&tree->lock);
40} 39}
41 40
@@ -155,20 +154,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
155 return NULL; 154 return NULL;
156} 155}
157 156
158/*
159 * look for an offset in the tree, and if it can't be found, return
160 * the first offset we can find smaller than 'offset'.
161 */
162static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
163{
164 struct rb_node *prev;
165 struct rb_node *ret;
166 ret = __tree_search(root, offset, &prev, NULL);
167 if (!ret)
168 return prev;
169 return ret;
170}
171
172/* check to see if two extent_map structs are adjacent and safe to merge */ 157/* check to see if two extent_map structs are adjacent and safe to merge */
173static int mergable_maps(struct extent_map *prev, struct extent_map *next) 158static int mergable_maps(struct extent_map *prev, struct extent_map *next)
174{ 159{
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 9b99886562d0..54a255065aa3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/bio.h> 19#include <linux/bio.h>
20#include <linux/slab.h>
20#include <linux/pagemap.h> 21#include <linux/pagemap.h>
21#include <linux/highmem.h> 22#include <linux/highmem.h>
22#include "ctree.h" 23#include "ctree.h"
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index feaa13b105d9..29ff749ff4ca 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -28,6 +28,7 @@
28#include <linux/writeback.h> 28#include <linux/writeback.h>
29#include <linux/statfs.h> 29#include <linux/statfs.h>
30#include <linux/compat.h> 30#include <linux/compat.h>
31#include <linux/slab.h>
31#include "ctree.h" 32#include "ctree.h"
32#include "disk-io.h" 33#include "disk-io.h"
33#include "transaction.h" 34#include "transaction.h"
@@ -123,7 +124,8 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
123 root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 124 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
124 125
125 end_of_last_block = start_pos + num_bytes - 1; 126 end_of_last_block = start_pos + num_bytes - 1;
126 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); 127 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
128 NULL);
127 if (err) 129 if (err)
128 return err; 130 return err;
129 131
@@ -506,7 +508,8 @@ next_slot:
506} 508}
507 509
508static int extent_mergeable(struct extent_buffer *leaf, int slot, 510static int extent_mergeable(struct extent_buffer *leaf, int slot,
509 u64 objectid, u64 bytenr, u64 *start, u64 *end) 511 u64 objectid, u64 bytenr, u64 orig_offset,
512 u64 *start, u64 *end)
510{ 513{
511 struct btrfs_file_extent_item *fi; 514 struct btrfs_file_extent_item *fi;
512 struct btrfs_key key; 515 struct btrfs_key key;
@@ -522,6 +525,7 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot,
522 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 525 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
523 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || 526 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
524 btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr || 527 btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
528 btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
525 btrfs_file_extent_compression(leaf, fi) || 529 btrfs_file_extent_compression(leaf, fi) ||
526 btrfs_file_extent_encryption(leaf, fi) || 530 btrfs_file_extent_encryption(leaf, fi) ||
527 btrfs_file_extent_other_encoding(leaf, fi)) 531 btrfs_file_extent_other_encoding(leaf, fi))
@@ -561,6 +565,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
561 u64 split; 565 u64 split;
562 int del_nr = 0; 566 int del_nr = 0;
563 int del_slot = 0; 567 int del_slot = 0;
568 int recow;
564 int ret; 569 int ret;
565 570
566 btrfs_drop_extent_cache(inode, start, end - 1, 0); 571 btrfs_drop_extent_cache(inode, start, end - 1, 0);
@@ -568,6 +573,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
568 path = btrfs_alloc_path(); 573 path = btrfs_alloc_path();
569 BUG_ON(!path); 574 BUG_ON(!path);
570again: 575again:
576 recow = 0;
571 split = start; 577 split = start;
572 key.objectid = inode->i_ino; 578 key.objectid = inode->i_ino;
573 key.type = BTRFS_EXTENT_DATA_KEY; 579 key.type = BTRFS_EXTENT_DATA_KEY;
@@ -591,12 +597,60 @@ again:
591 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 597 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
592 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 598 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
593 orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); 599 orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
600 memcpy(&new_key, &key, sizeof(new_key));
601
602 if (start == key.offset && end < extent_end) {
603 other_start = 0;
604 other_end = start;
605 if (extent_mergeable(leaf, path->slots[0] - 1,
606 inode->i_ino, bytenr, orig_offset,
607 &other_start, &other_end)) {
608 new_key.offset = end;
609 btrfs_set_item_key_safe(trans, root, path, &new_key);
610 fi = btrfs_item_ptr(leaf, path->slots[0],
611 struct btrfs_file_extent_item);
612 btrfs_set_file_extent_num_bytes(leaf, fi,
613 extent_end - end);
614 btrfs_set_file_extent_offset(leaf, fi,
615 end - orig_offset);
616 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
617 struct btrfs_file_extent_item);
618 btrfs_set_file_extent_num_bytes(leaf, fi,
619 end - other_start);
620 btrfs_mark_buffer_dirty(leaf);
621 goto out;
622 }
623 }
624
625 if (start > key.offset && end == extent_end) {
626 other_start = end;
627 other_end = 0;
628 if (extent_mergeable(leaf, path->slots[0] + 1,
629 inode->i_ino, bytenr, orig_offset,
630 &other_start, &other_end)) {
631 fi = btrfs_item_ptr(leaf, path->slots[0],
632 struct btrfs_file_extent_item);
633 btrfs_set_file_extent_num_bytes(leaf, fi,
634 start - key.offset);
635 path->slots[0]++;
636 new_key.offset = start;
637 btrfs_set_item_key_safe(trans, root, path, &new_key);
638
639 fi = btrfs_item_ptr(leaf, path->slots[0],
640 struct btrfs_file_extent_item);
641 btrfs_set_file_extent_num_bytes(leaf, fi,
642 other_end - start);
643 btrfs_set_file_extent_offset(leaf, fi,
644 start - orig_offset);
645 btrfs_mark_buffer_dirty(leaf);
646 goto out;
647 }
648 }
594 649
595 while (start > key.offset || end < extent_end) { 650 while (start > key.offset || end < extent_end) {
596 if (key.offset == start) 651 if (key.offset == start)
597 split = end; 652 split = end;
598 653
599 memcpy(&new_key, &key, sizeof(new_key));
600 new_key.offset = split; 654 new_key.offset = split;
601 ret = btrfs_duplicate_item(trans, root, path, &new_key); 655 ret = btrfs_duplicate_item(trans, root, path, &new_key);
602 if (ret == -EAGAIN) { 656 if (ret == -EAGAIN) {
@@ -631,15 +685,18 @@ again:
631 path->slots[0]--; 685 path->slots[0]--;
632 extent_end = end; 686 extent_end = end;
633 } 687 }
688 recow = 1;
634 } 689 }
635 690
636 fi = btrfs_item_ptr(leaf, path->slots[0],
637 struct btrfs_file_extent_item);
638
639 other_start = end; 691 other_start = end;
640 other_end = 0; 692 other_end = 0;
641 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, 693 if (extent_mergeable(leaf, path->slots[0] + 1,
642 bytenr, &other_start, &other_end)) { 694 inode->i_ino, bytenr, orig_offset,
695 &other_start, &other_end)) {
696 if (recow) {
697 btrfs_release_path(root, path);
698 goto again;
699 }
643 extent_end = other_end; 700 extent_end = other_end;
644 del_slot = path->slots[0] + 1; 701 del_slot = path->slots[0] + 1;
645 del_nr++; 702 del_nr++;
@@ -650,8 +707,13 @@ again:
650 } 707 }
651 other_start = 0; 708 other_start = 0;
652 other_end = start; 709 other_end = start;
653 if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino, 710 if (extent_mergeable(leaf, path->slots[0] - 1,
654 bytenr, &other_start, &other_end)) { 711 inode->i_ino, bytenr, orig_offset,
712 &other_start, &other_end)) {
713 if (recow) {
714 btrfs_release_path(root, path);
715 goto again;
716 }
655 key.offset = other_start; 717 key.offset = other_start;
656 del_slot = path->slots[0]; 718 del_slot = path->slots[0];
657 del_nr++; 719 del_nr++;
@@ -661,21 +723,23 @@ again:
661 BUG_ON(ret); 723 BUG_ON(ret);
662 } 724 }
663 if (del_nr == 0) { 725 if (del_nr == 0) {
726 fi = btrfs_item_ptr(leaf, path->slots[0],
727 struct btrfs_file_extent_item);
664 btrfs_set_file_extent_type(leaf, fi, 728 btrfs_set_file_extent_type(leaf, fi,
665 BTRFS_FILE_EXTENT_REG); 729 BTRFS_FILE_EXTENT_REG);
666 btrfs_mark_buffer_dirty(leaf); 730 btrfs_mark_buffer_dirty(leaf);
667 goto out; 731 } else {
668 } 732 fi = btrfs_item_ptr(leaf, del_slot - 1,
669 733 struct btrfs_file_extent_item);
670 fi = btrfs_item_ptr(leaf, del_slot - 1, 734 btrfs_set_file_extent_type(leaf, fi,
671 struct btrfs_file_extent_item); 735 BTRFS_FILE_EXTENT_REG);
672 btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); 736 btrfs_set_file_extent_num_bytes(leaf, fi,
673 btrfs_set_file_extent_num_bytes(leaf, fi, 737 extent_end - key.offset);
674 extent_end - key.offset); 738 btrfs_mark_buffer_dirty(leaf);
675 btrfs_mark_buffer_dirty(leaf);
676 739
677 ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 740 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
678 BUG_ON(ret); 741 BUG_ON(ret);
742 }
679out: 743out:
680 btrfs_free_path(path); 744 btrfs_free_path(path);
681 return 0; 745 return 0;
@@ -691,6 +755,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
691 loff_t pos, unsigned long first_index, 755 loff_t pos, unsigned long first_index,
692 unsigned long last_index, size_t write_bytes) 756 unsigned long last_index, size_t write_bytes)
693{ 757{
758 struct extent_state *cached_state = NULL;
694 int i; 759 int i;
695 unsigned long index = pos >> PAGE_CACHE_SHIFT; 760 unsigned long index = pos >> PAGE_CACHE_SHIFT;
696 struct inode *inode = fdentry(file)->d_inode; 761 struct inode *inode = fdentry(file)->d_inode;
@@ -719,16 +784,18 @@ again:
719 } 784 }
720 if (start_pos < inode->i_size) { 785 if (start_pos < inode->i_size) {
721 struct btrfs_ordered_extent *ordered; 786 struct btrfs_ordered_extent *ordered;
722 lock_extent(&BTRFS_I(inode)->io_tree, 787 lock_extent_bits(&BTRFS_I(inode)->io_tree,
723 start_pos, last_pos - 1, GFP_NOFS); 788 start_pos, last_pos - 1, 0, &cached_state,
789 GFP_NOFS);
724 ordered = btrfs_lookup_first_ordered_extent(inode, 790 ordered = btrfs_lookup_first_ordered_extent(inode,
725 last_pos - 1); 791 last_pos - 1);
726 if (ordered && 792 if (ordered &&
727 ordered->file_offset + ordered->len > start_pos && 793 ordered->file_offset + ordered->len > start_pos &&
728 ordered->file_offset < last_pos) { 794 ordered->file_offset < last_pos) {
729 btrfs_put_ordered_extent(ordered); 795 btrfs_put_ordered_extent(ordered);
730 unlock_extent(&BTRFS_I(inode)->io_tree, 796 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
731 start_pos, last_pos - 1, GFP_NOFS); 797 start_pos, last_pos - 1,
798 &cached_state, GFP_NOFS);
732 for (i = 0; i < num_pages; i++) { 799 for (i = 0; i < num_pages; i++) {
733 unlock_page(pages[i]); 800 unlock_page(pages[i]);
734 page_cache_release(pages[i]); 801 page_cache_release(pages[i]);
@@ -740,12 +807,13 @@ again:
740 if (ordered) 807 if (ordered)
741 btrfs_put_ordered_extent(ordered); 808 btrfs_put_ordered_extent(ordered);
742 809
743 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, 810 clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
744 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | 811 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
745 EXTENT_DO_ACCOUNTING, 812 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
746 GFP_NOFS); 813 GFP_NOFS);
747 unlock_extent(&BTRFS_I(inode)->io_tree, 814 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
748 start_pos, last_pos - 1, GFP_NOFS); 815 start_pos, last_pos - 1, &cached_state,
816 GFP_NOFS);
749 } 817 }
750 for (i = 0; i < num_pages; i++) { 818 for (i = 0; i < num_pages; i++) {
751 clear_page_dirty_for_io(pages[i]); 819 clear_page_dirty_for_io(pages[i]);
@@ -1073,7 +1141,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1073 } 1141 }
1074 mutex_lock(&dentry->d_inode->i_mutex); 1142 mutex_lock(&dentry->d_inode->i_mutex);
1075out: 1143out:
1076 return ret > 0 ? EIO : ret; 1144 return ret > 0 ? -EIO : ret;
1077} 1145}
1078 1146
1079static const struct vm_operations_struct btrfs_file_vm_ops = { 1147static const struct vm_operations_struct btrfs_file_vm_ops = {
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index cb2849f03251..f488fac04d99 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h>
21#include <linux/math64.h> 22#include <linux/math64.h>
22#include "ctree.h" 23#include "ctree.h"
23#include "free-space-cache.h" 24#include "free-space-cache.h"
@@ -870,7 +871,7 @@ __btrfs_return_cluster_to_free_space(
870 tree_insert_offset(&block_group->free_space_offset, 871 tree_insert_offset(&block_group->free_space_offset,
871 entry->offset, &entry->offset_index, 0); 872 entry->offset, &entry->offset_index, 0);
872 } 873 }
873 cluster->root.rb_node = NULL; 874 cluster->root = RB_ROOT;
874 875
875out: 876out:
876 spin_unlock(&cluster->lock); 877 spin_unlock(&cluster->lock);
@@ -1355,7 +1356,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
1355{ 1356{
1356 spin_lock_init(&cluster->lock); 1357 spin_lock_init(&cluster->lock);
1357 spin_lock_init(&cluster->refill_lock); 1358 spin_lock_init(&cluster->refill_lock);
1358 cluster->root.rb_node = NULL; 1359 cluster->root = RB_ROOT;
1359 cluster->max_size = 0; 1360 cluster->max_size = 0;
1360 cluster->points_to_bitmap = false; 1361 cluster->points_to_bitmap = false;
1361 INIT_LIST_HEAD(&cluster->block_group_list); 1362 INIT_LIST_HEAD(&cluster->block_group_list);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5440bab23635..2bfdc641d4e3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -36,6 +36,7 @@
36#include <linux/xattr.h> 36#include <linux/xattr.h>
37#include <linux/posix_acl.h> 37#include <linux/posix_acl.h>
38#include <linux/falloc.h> 38#include <linux/falloc.h>
39#include <linux/slab.h>
39#include "compat.h" 40#include "compat.h"
40#include "ctree.h" 41#include "ctree.h"
41#include "disk-io.h" 42#include "disk-io.h"
@@ -379,7 +380,8 @@ again:
379 * change at any time if we discover bad compression ratios. 380 * change at any time if we discover bad compression ratios.
380 */ 381 */
381 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && 382 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
382 btrfs_test_opt(root, COMPRESS)) { 383 (btrfs_test_opt(root, COMPRESS) ||
384 (BTRFS_I(inode)->force_compress))) {
383 WARN_ON(pages); 385 WARN_ON(pages);
384 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 386 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
385 387
@@ -483,7 +485,10 @@ again:
483 nr_pages_ret = 0; 485 nr_pages_ret = 0;
484 486
485 /* flag the file so we don't compress in the future */ 487 /* flag the file so we don't compress in the future */
486 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; 488 if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
489 !(BTRFS_I(inode)->force_compress)) {
490 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
491 }
487 } 492 }
488 if (will_compress) { 493 if (will_compress) {
489 *num_added += 1; 494 *num_added += 1;
@@ -569,8 +574,8 @@ retry:
569 unsigned long nr_written = 0; 574 unsigned long nr_written = 0;
570 575
571 lock_extent(io_tree, async_extent->start, 576 lock_extent(io_tree, async_extent->start,
572 async_extent->start + 577 async_extent->start +
573 async_extent->ram_size - 1, GFP_NOFS); 578 async_extent->ram_size - 1, GFP_NOFS);
574 579
575 /* allocate blocks */ 580 /* allocate blocks */
576 ret = cow_file_range(inode, async_cow->locked_page, 581 ret = cow_file_range(inode, async_cow->locked_page,
@@ -792,7 +797,7 @@ static noinline int cow_file_range(struct inode *inode,
792 while (disk_num_bytes > 0) { 797 while (disk_num_bytes > 0) {
793 unsigned long op; 798 unsigned long op;
794 799
795 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); 800 cur_alloc_size = disk_num_bytes;
796 ret = btrfs_reserve_extent(trans, root, cur_alloc_size, 801 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
797 root->sectorsize, 0, alloc_hint, 802 root->sectorsize, 0, alloc_hint,
798 (u64)-1, &ins, 1); 803 (u64)-1, &ins, 1);
@@ -1210,7 +1215,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1210 else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) 1215 else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
1211 ret = run_delalloc_nocow(inode, locked_page, start, end, 1216 ret = run_delalloc_nocow(inode, locked_page, start, end,
1212 page_started, 0, nr_written); 1217 page_started, 0, nr_written);
1213 else if (!btrfs_test_opt(root, COMPRESS)) 1218 else if (!btrfs_test_opt(root, COMPRESS) &&
1219 !(BTRFS_I(inode)->force_compress))
1214 ret = cow_file_range(inode, locked_page, start, end, 1220 ret = cow_file_range(inode, locked_page, start, end,
1215 page_started, nr_written, 1); 1221 page_started, nr_written, 1);
1216 else 1222 else
@@ -1222,30 +1228,9 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1222static int btrfs_split_extent_hook(struct inode *inode, 1228static int btrfs_split_extent_hook(struct inode *inode,
1223 struct extent_state *orig, u64 split) 1229 struct extent_state *orig, u64 split)
1224{ 1230{
1225 struct btrfs_root *root = BTRFS_I(inode)->root;
1226 u64 size;
1227
1228 if (!(orig->state & EXTENT_DELALLOC)) 1231 if (!(orig->state & EXTENT_DELALLOC))
1229 return 0; 1232 return 0;
1230 1233
1231 size = orig->end - orig->start + 1;
1232 if (size > root->fs_info->max_extent) {
1233 u64 num_extents;
1234 u64 new_size;
1235
1236 new_size = orig->end - split + 1;
1237 num_extents = div64_u64(size + root->fs_info->max_extent - 1,
1238 root->fs_info->max_extent);
1239
1240 /*
1241 * if we break a large extent up then leave oustanding_extents
1242 * be, since we've already accounted for the large extent.
1243 */
1244 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1245 root->fs_info->max_extent) < num_extents)
1246 return 0;
1247 }
1248
1249 spin_lock(&BTRFS_I(inode)->accounting_lock); 1234 spin_lock(&BTRFS_I(inode)->accounting_lock);
1250 BTRFS_I(inode)->outstanding_extents++; 1235 BTRFS_I(inode)->outstanding_extents++;
1251 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1236 spin_unlock(&BTRFS_I(inode)->accounting_lock);
@@ -1263,38 +1248,10 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1263 struct extent_state *new, 1248 struct extent_state *new,
1264 struct extent_state *other) 1249 struct extent_state *other)
1265{ 1250{
1266 struct btrfs_root *root = BTRFS_I(inode)->root;
1267 u64 new_size, old_size;
1268 u64 num_extents;
1269
1270 /* not delalloc, ignore it */ 1251 /* not delalloc, ignore it */
1271 if (!(other->state & EXTENT_DELALLOC)) 1252 if (!(other->state & EXTENT_DELALLOC))
1272 return 0; 1253 return 0;
1273 1254
1274 old_size = other->end - other->start + 1;
1275 if (new->start < other->start)
1276 new_size = other->end - new->start + 1;
1277 else
1278 new_size = new->end - other->start + 1;
1279
1280 /* we're not bigger than the max, unreserve the space and go */
1281 if (new_size <= root->fs_info->max_extent) {
1282 spin_lock(&BTRFS_I(inode)->accounting_lock);
1283 BTRFS_I(inode)->outstanding_extents--;
1284 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1285 return 0;
1286 }
1287
1288 /*
1289 * If we grew by another max_extent, just return, we want to keep that
1290 * reserved amount.
1291 */
1292 num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
1293 root->fs_info->max_extent);
1294 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1295 root->fs_info->max_extent) > num_extents)
1296 return 0;
1297
1298 spin_lock(&BTRFS_I(inode)->accounting_lock); 1255 spin_lock(&BTRFS_I(inode)->accounting_lock);
1299 BTRFS_I(inode)->outstanding_extents--; 1256 BTRFS_I(inode)->outstanding_extents--;
1300 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1257 spin_unlock(&BTRFS_I(inode)->accounting_lock);
@@ -1323,6 +1280,7 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1323 BTRFS_I(inode)->outstanding_extents++; 1280 BTRFS_I(inode)->outstanding_extents++;
1324 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1281 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1325 btrfs_delalloc_reserve_space(root, inode, end - start + 1); 1282 btrfs_delalloc_reserve_space(root, inode, end - start + 1);
1283
1326 spin_lock(&root->fs_info->delalloc_lock); 1284 spin_lock(&root->fs_info->delalloc_lock);
1327 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1285 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
1328 root->fs_info->delalloc_bytes += end - start + 1; 1286 root->fs_info->delalloc_bytes += end - start + 1;
@@ -1351,6 +1309,7 @@ static int btrfs_clear_bit_hook(struct inode *inode,
1351 1309
1352 if (bits & EXTENT_DO_ACCOUNTING) { 1310 if (bits & EXTENT_DO_ACCOUNTING) {
1353 spin_lock(&BTRFS_I(inode)->accounting_lock); 1311 spin_lock(&BTRFS_I(inode)->accounting_lock);
1312 WARN_ON(!BTRFS_I(inode)->outstanding_extents);
1354 BTRFS_I(inode)->outstanding_extents--; 1313 BTRFS_I(inode)->outstanding_extents--;
1355 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1314 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1356 btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 1315 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
@@ -1507,12 +1466,13 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1507 return 0; 1466 return 0;
1508} 1467}
1509 1468
1510int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end) 1469int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1470 struct extent_state **cached_state)
1511{ 1471{
1512 if ((end & (PAGE_CACHE_SIZE - 1)) == 0) 1472 if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
1513 WARN_ON(1); 1473 WARN_ON(1);
1514 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, 1474 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1515 GFP_NOFS); 1475 cached_state, GFP_NOFS);
1516} 1476}
1517 1477
1518/* see btrfs_writepage_start_hook for details on why this is required */ 1478/* see btrfs_writepage_start_hook for details on why this is required */
@@ -1525,6 +1485,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1525{ 1485{
1526 struct btrfs_writepage_fixup *fixup; 1486 struct btrfs_writepage_fixup *fixup;
1527 struct btrfs_ordered_extent *ordered; 1487 struct btrfs_ordered_extent *ordered;
1488 struct extent_state *cached_state = NULL;
1528 struct page *page; 1489 struct page *page;
1529 struct inode *inode; 1490 struct inode *inode;
1530 u64 page_start; 1491 u64 page_start;
@@ -1543,7 +1504,8 @@ again:
1543 page_start = page_offset(page); 1504 page_start = page_offset(page);
1544 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; 1505 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1545 1506
1546 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); 1507 lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
1508 &cached_state, GFP_NOFS);
1547 1509
1548 /* already ordered? We're done */ 1510 /* already ordered? We're done */
1549 if (PagePrivate2(page)) 1511 if (PagePrivate2(page))
@@ -1551,17 +1513,18 @@ again:
1551 1513
1552 ordered = btrfs_lookup_ordered_extent(inode, page_start); 1514 ordered = btrfs_lookup_ordered_extent(inode, page_start);
1553 if (ordered) { 1515 if (ordered) {
1554 unlock_extent(&BTRFS_I(inode)->io_tree, page_start, 1516 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
1555 page_end, GFP_NOFS); 1517 page_end, &cached_state, GFP_NOFS);
1556 unlock_page(page); 1518 unlock_page(page);
1557 btrfs_start_ordered_extent(inode, ordered, 1); 1519 btrfs_start_ordered_extent(inode, ordered, 1);
1558 goto again; 1520 goto again;
1559 } 1521 }
1560 1522
1561 btrfs_set_extent_delalloc(inode, page_start, page_end); 1523 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1562 ClearPageChecked(page); 1524 ClearPageChecked(page);
1563out: 1525out:
1564 unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); 1526 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
1527 &cached_state, GFP_NOFS);
1565out_page: 1528out_page:
1566 unlock_page(page); 1529 unlock_page(page);
1567 page_cache_release(page); 1530 page_cache_release(page);
@@ -1680,24 +1643,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1680 * before we start the transaction. It limits the amount of btree 1643 * before we start the transaction. It limits the amount of btree
1681 * reads required while inside the transaction. 1644 * reads required while inside the transaction.
1682 */ 1645 */
1683static noinline void reada_csum(struct btrfs_root *root,
1684 struct btrfs_path *path,
1685 struct btrfs_ordered_extent *ordered_extent)
1686{
1687 struct btrfs_ordered_sum *sum;
1688 u64 bytenr;
1689
1690 sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
1691 list);
1692 bytenr = sum->sums[0].bytenr;
1693
1694 /*
1695 * we don't care about the results, the point of this search is
1696 * just to get the btree leaves into ram
1697 */
1698 btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
1699}
1700
1701/* as ordered data IO finishes, this gets called so we can finish 1646/* as ordered data IO finishes, this gets called so we can finish
1702 * an ordered extent if the range of bytes in the file it covers are 1647 * an ordered extent if the range of bytes in the file it covers are
1703 * fully written. 1648 * fully written.
@@ -1708,40 +1653,16 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1708 struct btrfs_trans_handle *trans; 1653 struct btrfs_trans_handle *trans;
1709 struct btrfs_ordered_extent *ordered_extent = NULL; 1654 struct btrfs_ordered_extent *ordered_extent = NULL;
1710 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1655 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1711 struct btrfs_path *path; 1656 struct extent_state *cached_state = NULL;
1712 int compressed = 0; 1657 int compressed = 0;
1713 int ret; 1658 int ret;
1714 1659
1715 ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1); 1660 ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
1661 end - start + 1);
1716 if (!ret) 1662 if (!ret)
1717 return 0; 1663 return 0;
1718
1719 /*
1720 * before we join the transaction, try to do some of our IO.
1721 * This will limit the amount of IO that we have to do with
1722 * the transaction running. We're unlikely to need to do any
1723 * IO if the file extents are new, the disk_i_size checks
1724 * covers the most common case.
1725 */
1726 if (start < BTRFS_I(inode)->disk_i_size) {
1727 path = btrfs_alloc_path();
1728 if (path) {
1729 ret = btrfs_lookup_file_extent(NULL, root, path,
1730 inode->i_ino,
1731 start, 0);
1732 ordered_extent = btrfs_lookup_ordered_extent(inode,
1733 start);
1734 if (!list_empty(&ordered_extent->list)) {
1735 btrfs_release_path(root, path);
1736 reada_csum(root, path, ordered_extent);
1737 }
1738 btrfs_free_path(path);
1739 }
1740 }
1741
1742 if (!ordered_extent)
1743 ordered_extent = btrfs_lookup_ordered_extent(inode, start);
1744 BUG_ON(!ordered_extent); 1664 BUG_ON(!ordered_extent);
1665
1745 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1666 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
1746 BUG_ON(!list_empty(&ordered_extent->list)); 1667 BUG_ON(!list_empty(&ordered_extent->list));
1747 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1668 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
@@ -1754,9 +1675,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1754 goto out; 1675 goto out;
1755 } 1676 }
1756 1677
1757 lock_extent(io_tree, ordered_extent->file_offset, 1678 lock_extent_bits(io_tree, ordered_extent->file_offset,
1758 ordered_extent->file_offset + ordered_extent->len - 1, 1679 ordered_extent->file_offset + ordered_extent->len - 1,
1759 GFP_NOFS); 1680 0, &cached_state, GFP_NOFS);
1760 1681
1761 trans = btrfs_join_transaction(root, 1); 1682 trans = btrfs_join_transaction(root, 1);
1762 1683
@@ -1783,9 +1704,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1783 ordered_extent->len); 1704 ordered_extent->len);
1784 BUG_ON(ret); 1705 BUG_ON(ret);
1785 } 1706 }
1786 unlock_extent(io_tree, ordered_extent->file_offset, 1707 unlock_extent_cached(io_tree, ordered_extent->file_offset,
1787 ordered_extent->file_offset + ordered_extent->len - 1, 1708 ordered_extent->file_offset +
1788 GFP_NOFS); 1709 ordered_extent->len - 1, &cached_state, GFP_NOFS);
1710
1789 add_pending_csums(trans, inode, ordered_extent->file_offset, 1711 add_pending_csums(trans, inode, ordered_extent->file_offset,
1790 &ordered_extent->list); 1712 &ordered_extent->list);
1791 1713
@@ -2194,7 +2116,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2194 found_key.objectid = found_key.offset; 2116 found_key.objectid = found_key.offset;
2195 found_key.type = BTRFS_INODE_ITEM_KEY; 2117 found_key.type = BTRFS_INODE_ITEM_KEY;
2196 found_key.offset = 0; 2118 found_key.offset = 0;
2197 inode = btrfs_iget(root->fs_info->sb, &found_key, root); 2119 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2198 if (IS_ERR(inode)) 2120 if (IS_ERR(inode))
2199 break; 2121 break;
2200 2122
@@ -3122,6 +3044,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3122 struct btrfs_root *root = BTRFS_I(inode)->root; 3044 struct btrfs_root *root = BTRFS_I(inode)->root;
3123 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3045 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3124 struct btrfs_ordered_extent *ordered; 3046 struct btrfs_ordered_extent *ordered;
3047 struct extent_state *cached_state = NULL;
3125 char *kaddr; 3048 char *kaddr;
3126 u32 blocksize = root->sectorsize; 3049 u32 blocksize = root->sectorsize;
3127 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3050 pgoff_t index = from >> PAGE_CACHE_SHIFT;
@@ -3168,12 +3091,14 @@ again:
3168 } 3091 }
3169 wait_on_page_writeback(page); 3092 wait_on_page_writeback(page);
3170 3093
3171 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 3094 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
3095 GFP_NOFS);
3172 set_page_extent_mapped(page); 3096 set_page_extent_mapped(page);
3173 3097
3174 ordered = btrfs_lookup_ordered_extent(inode, page_start); 3098 ordered = btrfs_lookup_ordered_extent(inode, page_start);
3175 if (ordered) { 3099 if (ordered) {
3176 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3100 unlock_extent_cached(io_tree, page_start, page_end,
3101 &cached_state, GFP_NOFS);
3177 unlock_page(page); 3102 unlock_page(page);
3178 page_cache_release(page); 3103 page_cache_release(page);
3179 btrfs_start_ordered_extent(inode, ordered, 1); 3104 btrfs_start_ordered_extent(inode, ordered, 1);
@@ -3181,13 +3106,15 @@ again:
3181 goto again; 3106 goto again;
3182 } 3107 }
3183 3108
3184 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 3109 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
3185 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 3110 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
3186 GFP_NOFS); 3111 0, 0, &cached_state, GFP_NOFS);
3187 3112
3188 ret = btrfs_set_extent_delalloc(inode, page_start, page_end); 3113 ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
3114 &cached_state);
3189 if (ret) { 3115 if (ret) {
3190 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3116 unlock_extent_cached(io_tree, page_start, page_end,
3117 &cached_state, GFP_NOFS);
3191 goto out_unlock; 3118 goto out_unlock;
3192 } 3119 }
3193 3120
@@ -3200,7 +3127,8 @@ again:
3200 } 3127 }
3201 ClearPageChecked(page); 3128 ClearPageChecked(page);
3202 set_page_dirty(page); 3129 set_page_dirty(page);
3203 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3130 unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
3131 GFP_NOFS);
3204 3132
3205out_unlock: 3133out_unlock:
3206 if (ret) 3134 if (ret)
@@ -3218,6 +3146,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3218 struct btrfs_root *root = BTRFS_I(inode)->root; 3146 struct btrfs_root *root = BTRFS_I(inode)->root;
3219 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3147 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3220 struct extent_map *em; 3148 struct extent_map *em;
3149 struct extent_state *cached_state = NULL;
3221 u64 mask = root->sectorsize - 1; 3150 u64 mask = root->sectorsize - 1;
3222 u64 hole_start = (inode->i_size + mask) & ~mask; 3151 u64 hole_start = (inode->i_size + mask) & ~mask;
3223 u64 block_end = (size + mask) & ~mask; 3152 u64 block_end = (size + mask) & ~mask;
@@ -3233,11 +3162,13 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3233 struct btrfs_ordered_extent *ordered; 3162 struct btrfs_ordered_extent *ordered;
3234 btrfs_wait_ordered_range(inode, hole_start, 3163 btrfs_wait_ordered_range(inode, hole_start,
3235 block_end - hole_start); 3164 block_end - hole_start);
3236 lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); 3165 lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
3166 &cached_state, GFP_NOFS);
3237 ordered = btrfs_lookup_ordered_extent(inode, hole_start); 3167 ordered = btrfs_lookup_ordered_extent(inode, hole_start);
3238 if (!ordered) 3168 if (!ordered)
3239 break; 3169 break;
3240 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); 3170 unlock_extent_cached(io_tree, hole_start, block_end - 1,
3171 &cached_state, GFP_NOFS);
3241 btrfs_put_ordered_extent(ordered); 3172 btrfs_put_ordered_extent(ordered);
3242 } 3173 }
3243 3174
@@ -3282,7 +3213,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3282 break; 3213 break;
3283 } 3214 }
3284 3215
3285 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); 3216 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
3217 GFP_NOFS);
3286 return err; 3218 return err;
3287} 3219}
3288 3220
@@ -3680,6 +3612,7 @@ static noinline void init_btrfs_i(struct inode *inode)
3680 bi->index_cnt = (u64)-1; 3612 bi->index_cnt = (u64)-1;
3681 bi->last_unlink_trans = 0; 3613 bi->last_unlink_trans = 0;
3682 bi->ordered_data_close = 0; 3614 bi->ordered_data_close = 0;
3615 bi->force_compress = 0;
3683 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); 3616 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
3684 extent_io_tree_init(&BTRFS_I(inode)->io_tree, 3617 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
3685 inode->i_mapping, GFP_NOFS); 3618 inode->i_mapping, GFP_NOFS);
@@ -3728,7 +3661,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
3728 * Returns in *is_new if the inode was read from disk 3661 * Returns in *is_new if the inode was read from disk
3729 */ 3662 */
3730struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 3663struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3731 struct btrfs_root *root) 3664 struct btrfs_root *root, int *new)
3732{ 3665{
3733 struct inode *inode; 3666 struct inode *inode;
3734 3667
@@ -3743,6 +3676,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3743 3676
3744 inode_tree_add(inode); 3677 inode_tree_add(inode);
3745 unlock_new_inode(inode); 3678 unlock_new_inode(inode);
3679 if (new)
3680 *new = 1;
3746 } 3681 }
3747 3682
3748 return inode; 3683 return inode;
@@ -3795,7 +3730,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3795 return NULL; 3730 return NULL;
3796 3731
3797 if (location.type == BTRFS_INODE_ITEM_KEY) { 3732 if (location.type == BTRFS_INODE_ITEM_KEY) {
3798 inode = btrfs_iget(dir->i_sb, &location, root); 3733 inode = btrfs_iget(dir->i_sb, &location, root, NULL);
3799 return inode; 3734 return inode;
3800 } 3735 }
3801 3736
@@ -3810,7 +3745,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3810 else 3745 else
3811 inode = new_simple_dir(dir->i_sb, &location, sub_root); 3746 inode = new_simple_dir(dir->i_sb, &location, sub_root);
3812 } else { 3747 } else {
3813 inode = btrfs_iget(dir->i_sb, &location, sub_root); 3748 inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
3814 } 3749 }
3815 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 3750 srcu_read_unlock(&root->fs_info->subvol_srcu, index);
3816 3751
@@ -3995,7 +3930,11 @@ skip:
3995 3930
3996 /* Reached end of directory/root. Bump pos past the last item. */ 3931 /* Reached end of directory/root. Bump pos past the last item. */
3997 if (key_type == BTRFS_DIR_INDEX_KEY) 3932 if (key_type == BTRFS_DIR_INDEX_KEY)
3998 filp->f_pos = INT_LIMIT(off_t); 3933 /*
3934 * 32-bit glibc will use getdents64, but then strtol -
3935 * so the last number we can serve is this.
3936 */
3937 filp->f_pos = 0x7fffffff;
3999 else 3938 else
4000 filp->f_pos++; 3939 filp->f_pos++;
4001nopos: 3940nopos:
@@ -4005,7 +3944,7 @@ err:
4005 return ret; 3944 return ret;
4006} 3945}
4007 3946
4008int btrfs_write_inode(struct inode *inode, int wait) 3947int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4009{ 3948{
4010 struct btrfs_root *root = BTRFS_I(inode)->root; 3949 struct btrfs_root *root = BTRFS_I(inode)->root;
4011 struct btrfs_trans_handle *trans; 3950 struct btrfs_trans_handle *trans;
@@ -4014,7 +3953,7 @@ int btrfs_write_inode(struct inode *inode, int wait)
4014 if (root->fs_info->btree_inode == inode) 3953 if (root->fs_info->btree_inode == inode)
4015 return 0; 3954 return 0;
4016 3955
4017 if (wait) { 3956 if (wbc->sync_mode == WB_SYNC_ALL) {
4018 trans = btrfs_join_transaction(root, 1); 3957 trans = btrfs_join_transaction(root, 1);
4019 btrfs_set_trans_block_group(trans, inode); 3958 btrfs_set_trans_block_group(trans, inode);
4020 ret = btrfs_commit_transaction(trans, root); 3959 ret = btrfs_commit_transaction(trans, root);
@@ -4538,7 +4477,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4538 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4477 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4539 if (err) { 4478 if (err) {
4540 err = -ENOSPC; 4479 err = -ENOSPC;
4541 goto out_unlock; 4480 goto out_fail;
4542 } 4481 }
4543 4482
4544 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4483 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
@@ -5016,6 +4955,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
5016{ 4955{
5017 struct extent_io_tree *tree; 4956 struct extent_io_tree *tree;
5018 struct btrfs_ordered_extent *ordered; 4957 struct btrfs_ordered_extent *ordered;
4958 struct extent_state *cached_state = NULL;
5019 u64 page_start = page_offset(page); 4959 u64 page_start = page_offset(page);
5020 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 4960 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
5021 4961
@@ -5034,7 +4974,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
5034 btrfs_releasepage(page, GFP_NOFS); 4974 btrfs_releasepage(page, GFP_NOFS);
5035 return; 4975 return;
5036 } 4976 }
5037 lock_extent(tree, page_start, page_end, GFP_NOFS); 4977 lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
4978 GFP_NOFS);
5038 ordered = btrfs_lookup_ordered_extent(page->mapping->host, 4979 ordered = btrfs_lookup_ordered_extent(page->mapping->host,
5039 page_offset(page)); 4980 page_offset(page));
5040 if (ordered) { 4981 if (ordered) {
@@ -5045,7 +4986,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
5045 clear_extent_bit(tree, page_start, page_end, 4986 clear_extent_bit(tree, page_start, page_end,
5046 EXTENT_DIRTY | EXTENT_DELALLOC | 4987 EXTENT_DIRTY | EXTENT_DELALLOC |
5047 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, 4988 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
5048 NULL, GFP_NOFS); 4989 &cached_state, GFP_NOFS);
5049 /* 4990 /*
5050 * whoever cleared the private bit is responsible 4991 * whoever cleared the private bit is responsible
5051 * for the finish_ordered_io 4992 * for the finish_ordered_io
@@ -5055,11 +4996,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
5055 page_start, page_end); 4996 page_start, page_end);
5056 } 4997 }
5057 btrfs_put_ordered_extent(ordered); 4998 btrfs_put_ordered_extent(ordered);
5058 lock_extent(tree, page_start, page_end, GFP_NOFS); 4999 cached_state = NULL;
5000 lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
5001 GFP_NOFS);
5059 } 5002 }
5060 clear_extent_bit(tree, page_start, page_end, 5003 clear_extent_bit(tree, page_start, page_end,
5061 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 5004 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
5062 EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS); 5005 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
5063 __btrfs_releasepage(page, GFP_NOFS); 5006 __btrfs_releasepage(page, GFP_NOFS);
5064 5007
5065 ClearPageChecked(page); 5008 ClearPageChecked(page);
@@ -5092,6 +5035,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5092 struct btrfs_root *root = BTRFS_I(inode)->root; 5035 struct btrfs_root *root = BTRFS_I(inode)->root;
5093 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5036 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5094 struct btrfs_ordered_extent *ordered; 5037 struct btrfs_ordered_extent *ordered;
5038 struct extent_state *cached_state = NULL;
5095 char *kaddr; 5039 char *kaddr;
5096 unsigned long zero_start; 5040 unsigned long zero_start;
5097 loff_t size; 5041 loff_t size;
@@ -5130,7 +5074,8 @@ again:
5130 } 5074 }
5131 wait_on_page_writeback(page); 5075 wait_on_page_writeback(page);
5132 5076
5133 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 5077 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
5078 GFP_NOFS);
5134 set_page_extent_mapped(page); 5079 set_page_extent_mapped(page);
5135 5080
5136 /* 5081 /*
@@ -5139,7 +5084,8 @@ again:
5139 */ 5084 */
5140 ordered = btrfs_lookup_ordered_extent(inode, page_start); 5085 ordered = btrfs_lookup_ordered_extent(inode, page_start);
5141 if (ordered) { 5086 if (ordered) {
5142 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5087 unlock_extent_cached(io_tree, page_start, page_end,
5088 &cached_state, GFP_NOFS);
5143 unlock_page(page); 5089 unlock_page(page);
5144 btrfs_start_ordered_extent(inode, ordered, 1); 5090 btrfs_start_ordered_extent(inode, ordered, 1);
5145 btrfs_put_ordered_extent(ordered); 5091 btrfs_put_ordered_extent(ordered);
@@ -5153,13 +5099,15 @@ again:
5153 * is probably a better way to do this, but for now keep consistent with 5099 * is probably a better way to do this, but for now keep consistent with
5154 * prepare_pages in the normal write path. 5100 * prepare_pages in the normal write path.
5155 */ 5101 */
5156 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 5102 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
5157 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 5103 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
5158 GFP_NOFS); 5104 0, 0, &cached_state, GFP_NOFS);
5159 5105
5160 ret = btrfs_set_extent_delalloc(inode, page_start, page_end); 5106 ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
5107 &cached_state);
5161 if (ret) { 5108 if (ret) {
5162 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5109 unlock_extent_cached(io_tree, page_start, page_end,
5110 &cached_state, GFP_NOFS);
5163 ret = VM_FAULT_SIGBUS; 5111 ret = VM_FAULT_SIGBUS;
5164 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 5112 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5165 goto out_unlock; 5113 goto out_unlock;
@@ -5185,7 +5133,7 @@ again:
5185 BTRFS_I(inode)->last_trans = root->fs_info->generation; 5133 BTRFS_I(inode)->last_trans = root->fs_info->generation;
5186 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 5134 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
5187 5135
5188 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5136 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
5189 5137
5190out_unlock: 5138out_unlock:
5191 btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 5139 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
@@ -5390,7 +5338,6 @@ free:
5390void btrfs_drop_inode(struct inode *inode) 5338void btrfs_drop_inode(struct inode *inode)
5391{ 5339{
5392 struct btrfs_root *root = BTRFS_I(inode)->root; 5340 struct btrfs_root *root = BTRFS_I(inode)->root;
5393
5394 if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0) 5341 if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
5395 generic_delete_inode(inode); 5342 generic_delete_inode(inode);
5396 else 5343 else
@@ -5789,22 +5736,20 @@ out_fail:
5789} 5736}
5790 5737
5791static int prealloc_file_range(struct inode *inode, u64 start, u64 end, 5738static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
5792 u64 alloc_hint, int mode) 5739 u64 alloc_hint, int mode, loff_t actual_len)
5793{ 5740{
5794 struct btrfs_trans_handle *trans; 5741 struct btrfs_trans_handle *trans;
5795 struct btrfs_root *root = BTRFS_I(inode)->root; 5742 struct btrfs_root *root = BTRFS_I(inode)->root;
5796 struct btrfs_key ins; 5743 struct btrfs_key ins;
5797 u64 alloc_size;
5798 u64 cur_offset = start; 5744 u64 cur_offset = start;
5799 u64 num_bytes = end - start; 5745 u64 num_bytes = end - start;
5800 int ret = 0; 5746 int ret = 0;
5747 u64 i_size;
5801 5748
5802 while (num_bytes > 0) { 5749 while (num_bytes > 0) {
5803 alloc_size = min(num_bytes, root->fs_info->max_extent);
5804
5805 trans = btrfs_start_transaction(root, 1); 5750 trans = btrfs_start_transaction(root, 1);
5806 5751
5807 ret = btrfs_reserve_extent(trans, root, alloc_size, 5752 ret = btrfs_reserve_extent(trans, root, num_bytes,
5808 root->sectorsize, 0, alloc_hint, 5753 root->sectorsize, 0, alloc_hint,
5809 (u64)-1, &ins, 1); 5754 (u64)-1, &ins, 1);
5810 if (ret) { 5755 if (ret) {
@@ -5835,9 +5780,15 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
5835 inode->i_ctime = CURRENT_TIME; 5780 inode->i_ctime = CURRENT_TIME;
5836 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 5781 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
5837 if (!(mode & FALLOC_FL_KEEP_SIZE) && 5782 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
5838 cur_offset > inode->i_size) { 5783 (actual_len > inode->i_size) &&
5839 i_size_write(inode, cur_offset); 5784 (cur_offset > inode->i_size)) {
5840 btrfs_ordered_update_i_size(inode, cur_offset, NULL); 5785
5786 if (cur_offset > actual_len)
5787 i_size = actual_len;
5788 else
5789 i_size = cur_offset;
5790 i_size_write(inode, i_size);
5791 btrfs_ordered_update_i_size(inode, i_size, NULL);
5841 } 5792 }
5842 5793
5843 ret = btrfs_update_inode(trans, root, inode); 5794 ret = btrfs_update_inode(trans, root, inode);
@@ -5857,6 +5808,7 @@ stop_trans:
5857static long btrfs_fallocate(struct inode *inode, int mode, 5808static long btrfs_fallocate(struct inode *inode, int mode,
5858 loff_t offset, loff_t len) 5809 loff_t offset, loff_t len)
5859{ 5810{
5811 struct extent_state *cached_state = NULL;
5860 u64 cur_offset; 5812 u64 cur_offset;
5861 u64 last_byte; 5813 u64 last_byte;
5862 u64 alloc_start; 5814 u64 alloc_start;
@@ -5895,16 +5847,17 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5895 /* the extent lock is ordered inside the running 5847 /* the extent lock is ordered inside the running
5896 * transaction 5848 * transaction
5897 */ 5849 */
5898 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 5850 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
5899 GFP_NOFS); 5851 locked_end, 0, &cached_state, GFP_NOFS);
5900 ordered = btrfs_lookup_first_ordered_extent(inode, 5852 ordered = btrfs_lookup_first_ordered_extent(inode,
5901 alloc_end - 1); 5853 alloc_end - 1);
5902 if (ordered && 5854 if (ordered &&
5903 ordered->file_offset + ordered->len > alloc_start && 5855 ordered->file_offset + ordered->len > alloc_start &&
5904 ordered->file_offset < alloc_end) { 5856 ordered->file_offset < alloc_end) {
5905 btrfs_put_ordered_extent(ordered); 5857 btrfs_put_ordered_extent(ordered);
5906 unlock_extent(&BTRFS_I(inode)->io_tree, 5858 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
5907 alloc_start, locked_end, GFP_NOFS); 5859 alloc_start, locked_end,
5860 &cached_state, GFP_NOFS);
5908 /* 5861 /*
5909 * we can't wait on the range with the transaction 5862 * we can't wait on the range with the transaction
5910 * running or with the extent lock held 5863 * running or with the extent lock held
@@ -5930,7 +5883,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5930 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 5883 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5931 ret = prealloc_file_range(inode, 5884 ret = prealloc_file_range(inode,
5932 cur_offset, last_byte, 5885 cur_offset, last_byte,
5933 alloc_hint, mode); 5886 alloc_hint, mode, offset+len);
5934 if (ret < 0) { 5887 if (ret < 0) {
5935 free_extent_map(em); 5888 free_extent_map(em);
5936 break; 5889 break;
@@ -5946,8 +5899,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5946 break; 5899 break;
5947 } 5900 }
5948 } 5901 }
5949 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 5902 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
5950 GFP_NOFS); 5903 &cached_state, GFP_NOFS);
5951 5904
5952 btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, 5905 btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
5953 alloc_end - alloc_start); 5906 alloc_end - alloc_start);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 645a17927a8f..e84ef60ffe35 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -39,6 +39,7 @@
39#include <linux/security.h> 39#include <linux/security.h>
40#include <linux/xattr.h> 40#include <linux/xattr.h>
41#include <linux/vmalloc.h> 41#include <linux/vmalloc.h>
42#include <linux/slab.h>
42#include "compat.h" 43#include "compat.h"
43#include "ctree.h" 44#include "ctree.h"
44#include "disk-io.h" 45#include "disk-io.h"
@@ -474,7 +475,79 @@ out_unlock:
474 return error; 475 return error;
475} 476}
476 477
477static int btrfs_defrag_file(struct file *file) 478static int should_defrag_range(struct inode *inode, u64 start, u64 len,
479 int thresh, u64 *last_len, u64 *skip,
480 u64 *defrag_end)
481{
482 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
483 struct extent_map *em = NULL;
484 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
485 int ret = 1;
486
487
488 if (thresh == 0)
489 thresh = 256 * 1024;
490
491 /*
492 * make sure that once we start defragging an extent, we keep on
493 * defragging it
494 */
495 if (start < *defrag_end)
496 return 1;
497
498 *skip = 0;
499
500 /*
501 * hopefully we have this extent in the tree already, try without
502 * the full extent lock
503 */
504 read_lock(&em_tree->lock);
505 em = lookup_extent_mapping(em_tree, start, len);
506 read_unlock(&em_tree->lock);
507
508 if (!em) {
509 /* get the big lock and read metadata off disk */
510 lock_extent(io_tree, start, start + len - 1, GFP_NOFS);
511 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
512 unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
513
514 if (IS_ERR(em))
515 return 0;
516 }
517
518 /* this will cover holes, and inline extents */
519 if (em->block_start >= EXTENT_MAP_LAST_BYTE)
520 ret = 0;
521
522 /*
523 * we hit a real extent, if it is big don't bother defragging it again
524 */
525 if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh)
526 ret = 0;
527
528 /*
529 * last_len ends up being a counter of how many bytes we've defragged.
530 * every time we choose not to defrag an extent, we reset *last_len
531 * so that the next tiny extent will force a defrag.
532 *
533 * The end result of this is that tiny extents before a single big
534 * extent will force at least part of that big extent to be defragged.
535 */
536 if (ret) {
537 *last_len += len;
538 *defrag_end = extent_map_end(em);
539 } else {
540 *last_len = 0;
541 *skip = extent_map_end(em);
542 *defrag_end = 0;
543 }
544
545 free_extent_map(em);
546 return ret;
547}
548
549static int btrfs_defrag_file(struct file *file,
550 struct btrfs_ioctl_defrag_range_args *range)
478{ 551{
479 struct inode *inode = fdentry(file)->d_inode; 552 struct inode *inode = fdentry(file)->d_inode;
480 struct btrfs_root *root = BTRFS_I(inode)->root; 553 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -486,37 +559,96 @@ static int btrfs_defrag_file(struct file *file)
486 unsigned long total_read = 0; 559 unsigned long total_read = 0;
487 u64 page_start; 560 u64 page_start;
488 u64 page_end; 561 u64 page_end;
562 u64 last_len = 0;
563 u64 skip = 0;
564 u64 defrag_end = 0;
489 unsigned long i; 565 unsigned long i;
490 int ret; 566 int ret;
491 567
492 ret = btrfs_check_data_free_space(root, inode, inode->i_size); 568 if (inode->i_size == 0)
493 if (ret) 569 return 0;
494 return -ENOSPC; 570
571 if (range->start + range->len > range->start) {
572 last_index = min_t(u64, inode->i_size - 1,
573 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
574 } else {
575 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
576 }
577
578 i = range->start >> PAGE_CACHE_SHIFT;
579 while (i <= last_index) {
580 if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
581 PAGE_CACHE_SIZE,
582 range->extent_thresh,
583 &last_len, &skip,
584 &defrag_end)) {
585 unsigned long next;
586 /*
587 * the should_defrag_range function tells us how much to skip;
588 * bump our counter by the suggested amount
589 */
590 next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
591 i = max(i + 1, next);
592 continue;
593 }
495 594
496 mutex_lock(&inode->i_mutex);
497 last_index = inode->i_size >> PAGE_CACHE_SHIFT;
498 for (i = 0; i <= last_index; i++) {
499 if (total_read % ra_pages == 0) { 595 if (total_read % ra_pages == 0) {
500 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i, 596 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
501 min(last_index, i + ra_pages - 1)); 597 min(last_index, i + ra_pages - 1));
502 } 598 }
503 total_read++; 599 total_read++;
600 mutex_lock(&inode->i_mutex);
601 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
602 BTRFS_I(inode)->force_compress = 1;
603
604 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
605 if (ret) {
606 ret = -ENOSPC;
607 break;
608 }
609
610 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
611 if (ret) {
612 btrfs_free_reserved_data_space(root, inode,
613 PAGE_CACHE_SIZE);
614 ret = -ENOSPC;
615 break;
616 }
504again: 617again:
618 if (inode->i_size == 0 ||
619 i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
620 ret = 0;
621 goto err_reservations;
622 }
623
505 page = grab_cache_page(inode->i_mapping, i); 624 page = grab_cache_page(inode->i_mapping, i);
506 if (!page) 625 if (!page)
507 goto out_unlock; 626 goto err_reservations;
627
508 if (!PageUptodate(page)) { 628 if (!PageUptodate(page)) {
509 btrfs_readpage(NULL, page); 629 btrfs_readpage(NULL, page);
510 lock_page(page); 630 lock_page(page);
511 if (!PageUptodate(page)) { 631 if (!PageUptodate(page)) {
512 unlock_page(page); 632 unlock_page(page);
513 page_cache_release(page); 633 page_cache_release(page);
514 goto out_unlock; 634 goto err_reservations;
515 } 635 }
516 } 636 }
517 637
638 if (page->mapping != inode->i_mapping) {
639 unlock_page(page);
640 page_cache_release(page);
641 goto again;
642 }
643
518 wait_on_page_writeback(page); 644 wait_on_page_writeback(page);
519 645
646 if (PageDirty(page)) {
647 btrfs_free_reserved_data_space(root, inode,
648 PAGE_CACHE_SIZE);
649 goto loop_unlock;
650 }
651
520 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 652 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
521 page_end = page_start + PAGE_CACHE_SIZE - 1; 653 page_end = page_start + PAGE_CACHE_SIZE - 1;
522 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 654 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
@@ -537,18 +669,54 @@ again:
537 * page if it is dirtied again later 669 * page if it is dirtied again later
538 */ 670 */
539 clear_page_dirty_for_io(page); 671 clear_page_dirty_for_io(page);
672 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
673 page_end, EXTENT_DIRTY | EXTENT_DELALLOC |
674 EXTENT_DO_ACCOUNTING, GFP_NOFS);
540 675
541 btrfs_set_extent_delalloc(inode, page_start, page_end); 676 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
677 ClearPageChecked(page);
542 set_page_dirty(page); 678 set_page_dirty(page);
543 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 679 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
680
681loop_unlock:
544 unlock_page(page); 682 unlock_page(page);
545 page_cache_release(page); 683 page_cache_release(page);
684 mutex_unlock(&inode->i_mutex);
685
686 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
546 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); 687 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
688 i++;
689 }
690
691 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
692 filemap_flush(inode->i_mapping);
693
694 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
695 /* the filemap_flush will queue IO into the worker threads, but
696 * we have to make sure the IO is actually started and that
697 * ordered extents get created before we return
698 */
699 atomic_inc(&root->fs_info->async_submit_draining);
700 while (atomic_read(&root->fs_info->nr_async_submits) ||
701 atomic_read(&root->fs_info->async_delalloc_pages)) {
702 wait_event(root->fs_info->async_submit_wait,
703 (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
704 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
705 }
706 atomic_dec(&root->fs_info->async_submit_draining);
707
708 mutex_lock(&inode->i_mutex);
709 BTRFS_I(inode)->force_compress = 0;
710 mutex_unlock(&inode->i_mutex);
547 } 711 }
548 712
549out_unlock:
550 mutex_unlock(&inode->i_mutex);
551 return 0; 713 return 0;
714
715err_reservations:
716 mutex_unlock(&inode->i_mutex);
717 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
718 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
719 return ret;
552} 720}
553 721
554static noinline int btrfs_ioctl_resize(struct btrfs_root *root, 722static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
@@ -608,7 +776,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
608 mod = 1; 776 mod = 1;
609 sizestr++; 777 sizestr++;
610 } 778 }
611 new_size = btrfs_parse_size(sizestr); 779 new_size = memparse(sizestr, NULL);
612 if (new_size == 0) { 780 if (new_size == 0) {
613 ret = -EINVAL; 781 ret = -EINVAL;
614 goto out_unlock; 782 goto out_unlock;
@@ -743,6 +911,330 @@ out:
743 return ret; 911 return ret;
744} 912}
745 913
914static noinline int key_in_sk(struct btrfs_key *key,
915 struct btrfs_ioctl_search_key *sk)
916{
917 struct btrfs_key test;
918 int ret;
919
920 test.objectid = sk->min_objectid;
921 test.type = sk->min_type;
922 test.offset = sk->min_offset;
923
924 ret = btrfs_comp_cpu_keys(key, &test);
925 if (ret < 0)
926 return 0;
927
928 test.objectid = sk->max_objectid;
929 test.type = sk->max_type;
930 test.offset = sk->max_offset;
931
932 ret = btrfs_comp_cpu_keys(key, &test);
933 if (ret > 0)
934 return 0;
935 return 1;
936}
937
938static noinline int copy_to_sk(struct btrfs_root *root,
939 struct btrfs_path *path,
940 struct btrfs_key *key,
941 struct btrfs_ioctl_search_key *sk,
942 char *buf,
943 unsigned long *sk_offset,
944 int *num_found)
945{
946 u64 found_transid;
947 struct extent_buffer *leaf;
948 struct btrfs_ioctl_search_header sh;
949 unsigned long item_off;
950 unsigned long item_len;
951 int nritems;
952 int i;
953 int slot;
954 int found = 0;
955 int ret = 0;
956
957 leaf = path->nodes[0];
958 slot = path->slots[0];
959 nritems = btrfs_header_nritems(leaf);
960
961 if (btrfs_header_generation(leaf) > sk->max_transid) {
962 i = nritems;
963 goto advance_key;
964 }
965 found_transid = btrfs_header_generation(leaf);
966
967 for (i = slot; i < nritems; i++) {
968 item_off = btrfs_item_ptr_offset(leaf, i);
969 item_len = btrfs_item_size_nr(leaf, i);
970
971 if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
972 item_len = 0;
973
974 if (sizeof(sh) + item_len + *sk_offset >
975 BTRFS_SEARCH_ARGS_BUFSIZE) {
976 ret = 1;
977 goto overflow;
978 }
979
980 btrfs_item_key_to_cpu(leaf, key, i);
981 if (!key_in_sk(key, sk))
982 continue;
983
984 sh.objectid = key->objectid;
985 sh.offset = key->offset;
986 sh.type = key->type;
987 sh.len = item_len;
988 sh.transid = found_transid;
989
990 /* copy search result header */
991 memcpy(buf + *sk_offset, &sh, sizeof(sh));
992 *sk_offset += sizeof(sh);
993
994 if (item_len) {
995 char *p = buf + *sk_offset;
996 /* copy the item */
997 read_extent_buffer(leaf, p,
998 item_off, item_len);
999 *sk_offset += item_len;
1000 }
1001 found++;
1002
1003 if (*num_found >= sk->nr_items)
1004 break;
1005 }
1006advance_key:
1007 ret = 0;
1008 if (key->offset < (u64)-1 && key->offset < sk->max_offset)
1009 key->offset++;
1010 else if (key->type < (u8)-1 && key->type < sk->max_type) {
1011 key->offset = 0;
1012 key->type++;
1013 } else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) {
1014 key->offset = 0;
1015 key->type = 0;
1016 key->objectid++;
1017 } else
1018 ret = 1;
1019overflow:
1020 *num_found += found;
1021 return ret;
1022}
1023
1024static noinline int search_ioctl(struct inode *inode,
1025 struct btrfs_ioctl_search_args *args)
1026{
1027 struct btrfs_root *root;
1028 struct btrfs_key key;
1029 struct btrfs_key max_key;
1030 struct btrfs_path *path;
1031 struct btrfs_ioctl_search_key *sk = &args->key;
1032 struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
1033 int ret;
1034 int num_found = 0;
1035 unsigned long sk_offset = 0;
1036
1037 path = btrfs_alloc_path();
1038 if (!path)
1039 return -ENOMEM;
1040
1041 if (sk->tree_id == 0) {
1042 /* search the root of the inode that was passed */
1043 root = BTRFS_I(inode)->root;
1044 } else {
1045 key.objectid = sk->tree_id;
1046 key.type = BTRFS_ROOT_ITEM_KEY;
1047 key.offset = (u64)-1;
1048 root = btrfs_read_fs_root_no_name(info, &key);
1049 if (IS_ERR(root)) {
1050 printk(KERN_ERR "could not find root %llu\n",
1051 sk->tree_id);
1052 btrfs_free_path(path);
1053 return -ENOENT;
1054 }
1055 }
1056
1057 key.objectid = sk->min_objectid;
1058 key.type = sk->min_type;
1059 key.offset = sk->min_offset;
1060
1061 max_key.objectid = sk->max_objectid;
1062 max_key.type = sk->max_type;
1063 max_key.offset = sk->max_offset;
1064
1065 path->keep_locks = 1;
1066
1067 while(1) {
1068 ret = btrfs_search_forward(root, &key, &max_key, path, 0,
1069 sk->min_transid);
1070 if (ret != 0) {
1071 if (ret > 0)
1072 ret = 0;
1073 goto err;
1074 }
1075 ret = copy_to_sk(root, path, &key, sk, args->buf,
1076 &sk_offset, &num_found);
1077 btrfs_release_path(root, path);
1078 if (ret || num_found >= sk->nr_items)
1079 break;
1080
1081 }
1082 ret = 0;
1083err:
1084 sk->nr_items = num_found;
1085 btrfs_free_path(path);
1086 return ret;
1087}
1088
1089static noinline int btrfs_ioctl_tree_search(struct file *file,
1090 void __user *argp)
1091{
1092 struct btrfs_ioctl_search_args *args;
1093 struct inode *inode;
1094 int ret;
1095
1096 if (!capable(CAP_SYS_ADMIN))
1097 return -EPERM;
1098
1099 args = kmalloc(sizeof(*args), GFP_KERNEL);
1100 if (!args)
1101 return -ENOMEM;
1102
1103 if (copy_from_user(args, argp, sizeof(*args))) {
1104 kfree(args);
1105 return -EFAULT;
1106 }
1107 inode = fdentry(file)->d_inode;
1108 ret = search_ioctl(inode, args);
1109 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
1110 ret = -EFAULT;
1111 kfree(args);
1112 return ret;
1113}
1114
1115/*
1116 * Search INODE_REFs to identify path name of 'dirid' directory
1117 * in a 'tree_id' tree. and sets path name to 'name'.
1118 */
1119static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
1120 u64 tree_id, u64 dirid, char *name)
1121{
1122 struct btrfs_root *root;
1123 struct btrfs_key key;
1124 char *ptr;
1125 int ret = -1;
1126 int slot;
1127 int len;
1128 int total_len = 0;
1129 struct btrfs_inode_ref *iref;
1130 struct extent_buffer *l;
1131 struct btrfs_path *path;
1132
1133 if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
1134 name[0]='\0';
1135 return 0;
1136 }
1137
1138 path = btrfs_alloc_path();
1139 if (!path)
1140 return -ENOMEM;
1141
1142 ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX];
1143
1144 key.objectid = tree_id;
1145 key.type = BTRFS_ROOT_ITEM_KEY;
1146 key.offset = (u64)-1;
1147 root = btrfs_read_fs_root_no_name(info, &key);
1148 if (IS_ERR(root)) {
1149 printk(KERN_ERR "could not find root %llu\n", tree_id);
1150 ret = -ENOENT;
1151 goto out;
1152 }
1153
1154 key.objectid = dirid;
1155 key.type = BTRFS_INODE_REF_KEY;
1156 key.offset = (u64)-1;
1157
1158 while(1) {
1159 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1160 if (ret < 0)
1161 goto out;
1162
1163 l = path->nodes[0];
1164 slot = path->slots[0];
1165 if (ret > 0 && slot > 0)
1166 slot--;
1167 btrfs_item_key_to_cpu(l, &key, slot);
1168
1169 if (ret > 0 && (key.objectid != dirid ||
1170 key.type != BTRFS_INODE_REF_KEY)) {
1171 ret = -ENOENT;
1172 goto out;
1173 }
1174
1175 iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
1176 len = btrfs_inode_ref_name_len(l, iref);
1177 ptr -= len + 1;
1178 total_len += len + 1;
1179 if (ptr < name)
1180 goto out;
1181
1182 *(ptr + len) = '/';
1183 read_extent_buffer(l, ptr,(unsigned long)(iref + 1), len);
1184
1185 if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
1186 break;
1187
1188 btrfs_release_path(root, path);
1189 key.objectid = key.offset;
1190 key.offset = (u64)-1;
1191 dirid = key.objectid;
1192
1193 }
1194 if (ptr < name)
1195 goto out;
1196 memcpy(name, ptr, total_len);
1197 name[total_len]='\0';
1198 ret = 0;
1199out:
1200 btrfs_free_path(path);
1201 return ret;
1202}
1203
1204static noinline int btrfs_ioctl_ino_lookup(struct file *file,
1205 void __user *argp)
1206{
1207 struct btrfs_ioctl_ino_lookup_args *args;
1208 struct inode *inode;
1209 int ret;
1210
1211 if (!capable(CAP_SYS_ADMIN))
1212 return -EPERM;
1213
1214 args = kmalloc(sizeof(*args), GFP_KERNEL);
1215 if (!args)
1216 return -ENOMEM;
1217
1218 if (copy_from_user(args, argp, sizeof(*args))) {
1219 kfree(args);
1220 return -EFAULT;
1221 }
1222 inode = fdentry(file)->d_inode;
1223
1224 if (args->treeid == 0)
1225 args->treeid = BTRFS_I(inode)->root->root_key.objectid;
1226
1227 ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
1228 args->treeid, args->objectid,
1229 args->name);
1230
1231 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
1232 ret = -EFAULT;
1233
1234 kfree(args);
1235 return ret;
1236}
1237
746static noinline int btrfs_ioctl_snap_destroy(struct file *file, 1238static noinline int btrfs_ioctl_snap_destroy(struct file *file,
747 void __user *arg) 1239 void __user *arg)
748{ 1240{
@@ -849,10 +1341,11 @@ out:
849 return err; 1341 return err;
850} 1342}
851 1343
852static int btrfs_ioctl_defrag(struct file *file) 1344static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
853{ 1345{
854 struct inode *inode = fdentry(file)->d_inode; 1346 struct inode *inode = fdentry(file)->d_inode;
855 struct btrfs_root *root = BTRFS_I(inode)->root; 1347 struct btrfs_root *root = BTRFS_I(inode)->root;
1348 struct btrfs_ioctl_defrag_range_args *range;
856 int ret; 1349 int ret;
857 1350
858 ret = mnt_want_write(file->f_path.mnt); 1351 ret = mnt_want_write(file->f_path.mnt);
@@ -873,7 +1366,31 @@ static int btrfs_ioctl_defrag(struct file *file)
873 ret = -EINVAL; 1366 ret = -EINVAL;
874 goto out; 1367 goto out;
875 } 1368 }
876 btrfs_defrag_file(file); 1369
1370 range = kzalloc(sizeof(*range), GFP_KERNEL);
1371 if (!range) {
1372 ret = -ENOMEM;
1373 goto out;
1374 }
1375
1376 if (argp) {
1377 if (copy_from_user(range, argp,
1378 sizeof(*range))) {
1379 ret = -EFAULT;
1380 kfree(range);
1381 goto out;
1382 }
1383 /* compression requires us to start the IO */
1384 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
1385 range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
1386 range->extent_thresh = (u32)-1;
1387 }
1388 } else {
1389 /* the rest are all set to zero by kzalloc */
1390 range->len = (u64)-1;
1391 }
1392 btrfs_defrag_file(file, range);
1393 kfree(range);
877 break; 1394 break;
878 } 1395 }
879out: 1396out:
@@ -1274,6 +1791,157 @@ out:
1274 return ret; 1791 return ret;
1275} 1792}
1276 1793
1794static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
1795{
1796 struct inode *inode = fdentry(file)->d_inode;
1797 struct btrfs_root *root = BTRFS_I(inode)->root;
1798 struct btrfs_root *new_root;
1799 struct btrfs_dir_item *di;
1800 struct btrfs_trans_handle *trans;
1801 struct btrfs_path *path;
1802 struct btrfs_key location;
1803 struct btrfs_disk_key disk_key;
1804 struct btrfs_super_block *disk_super;
1805 u64 features;
1806 u64 objectid = 0;
1807 u64 dir_id;
1808
1809 if (!capable(CAP_SYS_ADMIN))
1810 return -EPERM;
1811
1812 if (copy_from_user(&objectid, argp, sizeof(objectid)))
1813 return -EFAULT;
1814
1815 if (!objectid)
1816 objectid = root->root_key.objectid;
1817
1818 location.objectid = objectid;
1819 location.type = BTRFS_ROOT_ITEM_KEY;
1820 location.offset = (u64)-1;
1821
1822 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
1823 if (IS_ERR(new_root))
1824 return PTR_ERR(new_root);
1825
1826 if (btrfs_root_refs(&new_root->root_item) == 0)
1827 return -ENOENT;
1828
1829 path = btrfs_alloc_path();
1830 if (!path)
1831 return -ENOMEM;
1832 path->leave_spinning = 1;
1833
1834 trans = btrfs_start_transaction(root, 1);
1835 if (!trans) {
1836 btrfs_free_path(path);
1837 return -ENOMEM;
1838 }
1839
1840 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
1841 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
1842 dir_id, "default", 7, 1);
1843 if (!di) {
1844 btrfs_free_path(path);
1845 btrfs_end_transaction(trans, root);
1846 printk(KERN_ERR "Umm, you don't have the default dir item, "
1847 "this isn't going to work\n");
1848 return -ENOENT;
1849 }
1850
1851 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
1852 btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
1853 btrfs_mark_buffer_dirty(path->nodes[0]);
1854 btrfs_free_path(path);
1855
1856 disk_super = &root->fs_info->super_copy;
1857 features = btrfs_super_incompat_flags(disk_super);
1858 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
1859 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
1860 btrfs_set_super_incompat_flags(disk_super, features);
1861 }
1862 btrfs_end_transaction(trans, root);
1863
1864 return 0;
1865}
1866
1867long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
1868{
1869 struct btrfs_ioctl_space_args space_args;
1870 struct btrfs_ioctl_space_info space;
1871 struct btrfs_ioctl_space_info *dest;
1872 struct btrfs_ioctl_space_info *dest_orig;
1873 struct btrfs_ioctl_space_info *user_dest;
1874 struct btrfs_space_info *info;
1875 int alloc_size;
1876 int ret = 0;
1877 int slot_count = 0;
1878
1879 if (copy_from_user(&space_args,
1880 (struct btrfs_ioctl_space_args __user *)arg,
1881 sizeof(space_args)))
1882 return -EFAULT;
1883
1884 /* first we count slots */
1885 rcu_read_lock();
1886 list_for_each_entry_rcu(info, &root->fs_info->space_info, list)
1887 slot_count++;
1888 rcu_read_unlock();
1889
1890 /* space_slots == 0 means they are asking for a count */
1891 if (space_args.space_slots == 0) {
1892 space_args.total_spaces = slot_count;
1893 goto out;
1894 }
1895 alloc_size = sizeof(*dest) * slot_count;
1896 /* we generally have at most 6 or so space infos, one for each raid
1897 * level. So, a whole page should be more than enough for everyone
1898 */
1899 if (alloc_size > PAGE_CACHE_SIZE)
1900 return -ENOMEM;
1901
1902 space_args.total_spaces = 0;
1903 dest = kmalloc(alloc_size, GFP_NOFS);
1904 if (!dest)
1905 return -ENOMEM;
1906 dest_orig = dest;
1907
1908 /* now we have a buffer to copy into */
1909 rcu_read_lock();
1910 list_for_each_entry_rcu(info, &root->fs_info->space_info, list) {
1911 /* make sure we don't copy more than we allocated
1912 * in our buffer
1913 */
1914 if (slot_count == 0)
1915 break;
1916 slot_count--;
1917
1918 /* make sure userland has enough room in their buffer */
1919 if (space_args.total_spaces >= space_args.space_slots)
1920 break;
1921
1922 space.flags = info->flags;
1923 space.total_bytes = info->total_bytes;
1924 space.used_bytes = info->bytes_used;
1925 memcpy(dest, &space, sizeof(space));
1926 dest++;
1927 space_args.total_spaces++;
1928 }
1929 rcu_read_unlock();
1930
1931 user_dest = (struct btrfs_ioctl_space_info *)
1932 (arg + sizeof(struct btrfs_ioctl_space_args));
1933
1934 if (copy_to_user(user_dest, dest_orig, alloc_size))
1935 ret = -EFAULT;
1936
1937 kfree(dest_orig);
1938out:
1939 if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
1940 ret = -EFAULT;
1941
1942 return ret;
1943}
1944
1277/* 1945/*
1278 * there are many ways the trans_start and trans_end ioctls can lead 1946 * there are many ways the trans_start and trans_end ioctls can lead
1279 * to deadlocks. They should only be used by applications that 1947 * to deadlocks. They should only be used by applications that
@@ -1320,8 +1988,12 @@ long btrfs_ioctl(struct file *file, unsigned int
1320 return btrfs_ioctl_snap_create(file, argp, 1); 1988 return btrfs_ioctl_snap_create(file, argp, 1);
1321 case BTRFS_IOC_SNAP_DESTROY: 1989 case BTRFS_IOC_SNAP_DESTROY:
1322 return btrfs_ioctl_snap_destroy(file, argp); 1990 return btrfs_ioctl_snap_destroy(file, argp);
1991 case BTRFS_IOC_DEFAULT_SUBVOL:
1992 return btrfs_ioctl_default_subvol(file, argp);
1323 case BTRFS_IOC_DEFRAG: 1993 case BTRFS_IOC_DEFRAG:
1324 return btrfs_ioctl_defrag(file); 1994 return btrfs_ioctl_defrag(file, NULL);
1995 case BTRFS_IOC_DEFRAG_RANGE:
1996 return btrfs_ioctl_defrag(file, argp);
1325 case BTRFS_IOC_RESIZE: 1997 case BTRFS_IOC_RESIZE:
1326 return btrfs_ioctl_resize(root, argp); 1998 return btrfs_ioctl_resize(root, argp);
1327 case BTRFS_IOC_ADD_DEV: 1999 case BTRFS_IOC_ADD_DEV:
@@ -1338,6 +2010,12 @@ long btrfs_ioctl(struct file *file, unsigned int
1338 return btrfs_ioctl_trans_start(file); 2010 return btrfs_ioctl_trans_start(file);
1339 case BTRFS_IOC_TRANS_END: 2011 case BTRFS_IOC_TRANS_END:
1340 return btrfs_ioctl_trans_end(file); 2012 return btrfs_ioctl_trans_end(file);
2013 case BTRFS_IOC_TREE_SEARCH:
2014 return btrfs_ioctl_tree_search(file, argp);
2015 case BTRFS_IOC_INO_LOOKUP:
2016 return btrfs_ioctl_ino_lookup(file, argp);
2017 case BTRFS_IOC_SPACE_INFO:
2018 return btrfs_ioctl_space_info(root, argp);
1341 case BTRFS_IOC_SYNC: 2019 case BTRFS_IOC_SYNC:
1342 btrfs_sync_fs(file->f_dentry->d_sb, 1); 2020 btrfs_sync_fs(file->f_dentry->d_sb, 1);
1343 return 0; 2021 return 0;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index bc49914475eb..424694aa517f 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,12 +30,114 @@ struct btrfs_ioctl_vol_args {
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
/*
 * Argument block for BTRFS_IOC_INO_LOOKUP.  The ioctl handler replaces a
 * treeid of 0 with the objectid of the tree the file lives in, looks up
 * the path of objectid in that tree and returns it in name.
 * BTRFS_INO_LOOKUP_PATH_MAX is the size of the name buffer in bytes.
 */
33#define BTRFS_INO_LOOKUP_PATH_MAX 4080
34struct btrfs_ioctl_ino_lookup_args {
35 __u64 treeid;
36 __u64 objectid;
37 char name[BTRFS_INO_LOOKUP_PATH_MAX];
38};
39
/* search parameters for BTRFS_IOC_TREE_SEARCH; nr_items is also an output */
40struct btrfs_ioctl_search_key {
41 /* which root are we searching. 0 is the root of the file the ioctl was issued on */
42 __u64 tree_id;
43
44 /* keys returned will be >= min and <= max */
45 __u64 min_objectid;
46 __u64 max_objectid;
47
48 /* keys returned will be >= min and <= max */
49 __u64 min_offset;
50 __u64 max_offset;
51
52 /* max and min transids to search for */
53 __u64 min_transid;
54 __u64 max_transid;
55
56 /* keys returned will be >= min and <= max */
57 __u32 min_type;
58 __u32 max_type;
59
60 /*
61 * how many items did userland ask for, and how many are we
62 * returning
63 */
64 __u32 nr_items;
65
66 /* align to 64 bits */
67 __u32 unused;
68
69 /* some extra for later */
70 __u64 unused1;
71 __u64 unused2;
72 __u64 unused3;
73 __u64 unused4;
74};
75
/*
 * Header written in front of every item returned by the tree search
 * ioctl.  objectid/type/offset are the item's key, transid is the
 * generation of the leaf the item was read from, and len is the number
 * of item bytes that follow the header (0 when the item was larger than
 * the whole result buffer and its data was left out).
 */
76struct btrfs_ioctl_search_header {
77 __u64 transid;
78 __u64 objectid;
79 __u64 offset;
80 __u32 type;
81 __u32 len;
82};
83
/* size of the result buffer: the search key plus the buffer add up to 4096 bytes */
84#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key))
85/*
86 * the buf is an array of search headers where
87 * each header is followed by the actual item.
88 * the type field is expanded to 32 bits for alignment
89 */
90struct btrfs_ioctl_search_args {
91 struct btrfs_ioctl_search_key key;
92 char buf[BTRFS_SEARCH_ARGS_BUFSIZE];
93};
94
33struct btrfs_ioctl_clone_range_args { 95struct btrfs_ioctl_clone_range_args {
34 __s64 src_fd; 96 __s64 src_fd;
35 __u64 src_offset, src_length; 97 __u64 src_offset, src_length;
36 __u64 dest_offset; 98 __u64 dest_offset;
37}; 99};
38 100
101/* flags for the defrag range ioctl */
/*
 * When COMPRESS is set the ioctl handler also sets START_IO and
 * overrides extent_thresh with (u32)-1.
 */
102#define BTRFS_DEFRAG_RANGE_COMPRESS 1
103#define BTRFS_DEFRAG_RANGE_START_IO 2
104
/* argument block for BTRFS_IOC_DEFRAG_RANGE */
105struct btrfs_ioctl_defrag_range_args {
106 /* start of the defrag operation */
107 __u64 start;
108
109 /* number of bytes to defrag, use (u64)-1 to say all */
110 __u64 len;
111
112 /*
113 * flags for the operation, which can include turning
114 * on compression for this one defrag
115 */
116 __u64 flags;
117
118 /*
119 * any extent bigger than this will be considered
120 * already defragged. Use 0 to take the kernel default.
121 * Use 1 to say every single extent must be rewritten
122 */
123 __u32 extent_thresh;
124
125 /* spare for later */
126 __u32 unused[5];
127};
128
/*
 * One entry returned by BTRFS_IOC_SPACE_INFO.  The handler fills it from
 * a kernel space info entry: flags, total_bytes and bytes_used.
 */
129struct btrfs_ioctl_space_info {
130 __u64 flags;
131 __u64 total_bytes;
132 __u64 used_bytes;
133};
134
/*
 * Argument block for BTRFS_IOC_SPACE_INFO.  space_slots is the number of
 * entries the caller has room for in spaces[]; 0 asks only for a count.
 * total_spaces is set by the kernel: the number of space infos when
 * space_slots is 0, otherwise the number of entries written to spaces[].
 */
135struct btrfs_ioctl_space_args {
136 __u64 space_slots;
137 __u64 total_spaces;
138 struct btrfs_ioctl_space_info spaces[0];
139};
140
39#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 141#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
40 struct btrfs_ioctl_vol_args) 142 struct btrfs_ioctl_vol_args)
41#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 143#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -67,4 +169,13 @@ struct btrfs_ioctl_clone_range_args {
67 struct btrfs_ioctl_vol_args) 169 struct btrfs_ioctl_vol_args)
68#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ 170#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
69 struct btrfs_ioctl_vol_args) 171 struct btrfs_ioctl_vol_args)
172#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
173 struct btrfs_ioctl_defrag_range_args)
174#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \
175 struct btrfs_ioctl_search_args)
176#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
177 struct btrfs_ioctl_ino_lookup_args)
178#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
179#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
180 struct btrfs_ioctl_space_args)
70#endif 181#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 1c36e5cd8f55..6151f2ea38bb 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/gfp.h>
20#include <linux/pagemap.h> 19#include <linux/pagemap.h>
21#include <linux/spinlock.h> 20#include <linux/spinlock.h>
22#include <linux/page-flags.h> 21#include <linux/page-flags.h>
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b10a49d4bc6a..a127c0ebb2dc 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/gfp.h>
20#include <linux/slab.h> 19#include <linux/slab.h>
21#include <linux/blkdev.h> 20#include <linux/blkdev.h>
22#include <linux/writeback.h> 21#include <linux/writeback.h>
@@ -174,7 +173,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
174 if (!entry) 173 if (!entry)
175 return -ENOMEM; 174 return -ENOMEM;
176 175
177 mutex_lock(&tree->mutex);
178 entry->file_offset = file_offset; 176 entry->file_offset = file_offset;
179 entry->start = start; 177 entry->start = start;
180 entry->len = len; 178 entry->len = len;
@@ -190,16 +188,17 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
190 INIT_LIST_HEAD(&entry->list); 188 INIT_LIST_HEAD(&entry->list);
191 INIT_LIST_HEAD(&entry->root_extent_list); 189 INIT_LIST_HEAD(&entry->root_extent_list);
192 190
191 spin_lock(&tree->lock);
193 node = tree_insert(&tree->tree, file_offset, 192 node = tree_insert(&tree->tree, file_offset,
194 &entry->rb_node); 193 &entry->rb_node);
195 BUG_ON(node); 194 BUG_ON(node);
195 spin_unlock(&tree->lock);
196 196
197 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 197 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
198 list_add_tail(&entry->root_extent_list, 198 list_add_tail(&entry->root_extent_list,
199 &BTRFS_I(inode)->root->fs_info->ordered_extents); 199 &BTRFS_I(inode)->root->fs_info->ordered_extents);
200 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 200 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
201 201
202 mutex_unlock(&tree->mutex);
203 BUG_ON(node); 202 BUG_ON(node);
204 return 0; 203 return 0;
205} 204}
@@ -216,9 +215,9 @@ int btrfs_add_ordered_sum(struct inode *inode,
216 struct btrfs_ordered_inode_tree *tree; 215 struct btrfs_ordered_inode_tree *tree;
217 216
218 tree = &BTRFS_I(inode)->ordered_tree; 217 tree = &BTRFS_I(inode)->ordered_tree;
219 mutex_lock(&tree->mutex); 218 spin_lock(&tree->lock);
220 list_add_tail(&sum->list, &entry->list); 219 list_add_tail(&sum->list, &entry->list);
221 mutex_unlock(&tree->mutex); 220 spin_unlock(&tree->lock);
222 return 0; 221 return 0;
223} 222}
224 223
@@ -232,15 +231,16 @@ int btrfs_add_ordered_sum(struct inode *inode,
232 * to make sure this function only returns 1 once for a given ordered extent. 231 * to make sure this function only returns 1 once for a given ordered extent.
233 */ 232 */
234int btrfs_dec_test_ordered_pending(struct inode *inode, 233int btrfs_dec_test_ordered_pending(struct inode *inode,
234 struct btrfs_ordered_extent **cached,
235 u64 file_offset, u64 io_size) 235 u64 file_offset, u64 io_size)
236{ 236{
237 struct btrfs_ordered_inode_tree *tree; 237 struct btrfs_ordered_inode_tree *tree;
238 struct rb_node *node; 238 struct rb_node *node;
239 struct btrfs_ordered_extent *entry; 239 struct btrfs_ordered_extent *entry = NULL;
240 int ret; 240 int ret;
241 241
242 tree = &BTRFS_I(inode)->ordered_tree; 242 tree = &BTRFS_I(inode)->ordered_tree;
243 mutex_lock(&tree->mutex); 243 spin_lock(&tree->lock);
244 node = tree_search(tree, file_offset); 244 node = tree_search(tree, file_offset);
245 if (!node) { 245 if (!node) {
246 ret = 1; 246 ret = 1;
@@ -264,7 +264,11 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
264 else 264 else
265 ret = 1; 265 ret = 1;
266out: 266out:
267 mutex_unlock(&tree->mutex); 267 if (!ret && cached && entry) {
268 *cached = entry;
269 atomic_inc(&entry->refs);
270 }
271 spin_unlock(&tree->lock);
268 return ret == 0; 272 return ret == 0;
269} 273}
270 274
@@ -291,13 +295,14 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
291 295
292/* 296/*
293 * remove an ordered extent from the tree. No references are dropped 297 * remove an ordered extent from the tree. No references are dropped
294 * and you must wake_up entry->wait. You must hold the tree mutex 298 * and you must wake_up entry->wait. You must hold the tree lock
295 * while you call this function. 299 * while you call this function.
296 */ 300 */
297static int __btrfs_remove_ordered_extent(struct inode *inode, 301static int __btrfs_remove_ordered_extent(struct inode *inode,
298 struct btrfs_ordered_extent *entry) 302 struct btrfs_ordered_extent *entry)
299{ 303{
300 struct btrfs_ordered_inode_tree *tree; 304 struct btrfs_ordered_inode_tree *tree;
305 struct btrfs_root *root = BTRFS_I(inode)->root;
301 struct rb_node *node; 306 struct rb_node *node;
302 307
303 tree = &BTRFS_I(inode)->ordered_tree; 308 tree = &BTRFS_I(inode)->ordered_tree;
@@ -307,12 +312,13 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
307 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 312 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
308 313
309 spin_lock(&BTRFS_I(inode)->accounting_lock); 314 spin_lock(&BTRFS_I(inode)->accounting_lock);
315 WARN_ON(!BTRFS_I(inode)->outstanding_extents);
310 BTRFS_I(inode)->outstanding_extents--; 316 BTRFS_I(inode)->outstanding_extents--;
311 spin_unlock(&BTRFS_I(inode)->accounting_lock); 317 spin_unlock(&BTRFS_I(inode)->accounting_lock);
312 btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root, 318 btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
313 inode, 1); 319 inode, 1);
314 320
315 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 321 spin_lock(&root->fs_info->ordered_extent_lock);
316 list_del_init(&entry->root_extent_list); 322 list_del_init(&entry->root_extent_list);
317 323
318 /* 324 /*
@@ -324,7 +330,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
324 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { 330 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
325 list_del_init(&BTRFS_I(inode)->ordered_operations); 331 list_del_init(&BTRFS_I(inode)->ordered_operations);
326 } 332 }
327 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 333 spin_unlock(&root->fs_info->ordered_extent_lock);
328 334
329 return 0; 335 return 0;
330} 336}
@@ -340,9 +346,9 @@ int btrfs_remove_ordered_extent(struct inode *inode,
340 int ret; 346 int ret;
341 347
342 tree = &BTRFS_I(inode)->ordered_tree; 348 tree = &BTRFS_I(inode)->ordered_tree;
343 mutex_lock(&tree->mutex); 349 spin_lock(&tree->lock);
344 ret = __btrfs_remove_ordered_extent(inode, entry); 350 ret = __btrfs_remove_ordered_extent(inode, entry);
345 mutex_unlock(&tree->mutex); 351 spin_unlock(&tree->lock);
346 wake_up(&entry->wait); 352 wake_up(&entry->wait);
347 353
348 return ret; 354 return ret;
@@ -567,7 +573,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
567 struct btrfs_ordered_extent *entry = NULL; 573 struct btrfs_ordered_extent *entry = NULL;
568 574
569 tree = &BTRFS_I(inode)->ordered_tree; 575 tree = &BTRFS_I(inode)->ordered_tree;
570 mutex_lock(&tree->mutex); 576 spin_lock(&tree->lock);
571 node = tree_search(tree, file_offset); 577 node = tree_search(tree, file_offset);
572 if (!node) 578 if (!node)
573 goto out; 579 goto out;
@@ -578,7 +584,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
578 if (entry) 584 if (entry)
579 atomic_inc(&entry->refs); 585 atomic_inc(&entry->refs);
580out: 586out:
581 mutex_unlock(&tree->mutex); 587 spin_unlock(&tree->lock);
582 return entry; 588 return entry;
583} 589}
584 590
@@ -594,7 +600,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
594 struct btrfs_ordered_extent *entry = NULL; 600 struct btrfs_ordered_extent *entry = NULL;
595 601
596 tree = &BTRFS_I(inode)->ordered_tree; 602 tree = &BTRFS_I(inode)->ordered_tree;
597 mutex_lock(&tree->mutex); 603 spin_lock(&tree->lock);
598 node = tree_search(tree, file_offset); 604 node = tree_search(tree, file_offset);
599 if (!node) 605 if (!node)
600 goto out; 606 goto out;
@@ -602,7 +608,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
602 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 608 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
603 atomic_inc(&entry->refs); 609 atomic_inc(&entry->refs);
604out: 610out:
605 mutex_unlock(&tree->mutex); 611 spin_unlock(&tree->lock);
606 return entry; 612 return entry;
607} 613}
608 614
@@ -626,8 +632,10 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
626 632
627 if (ordered) 633 if (ordered)
628 offset = entry_end(ordered); 634 offset = entry_end(ordered);
635 else
636 offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
629 637
630 mutex_lock(&tree->mutex); 638 spin_lock(&tree->lock);
631 disk_i_size = BTRFS_I(inode)->disk_i_size; 639 disk_i_size = BTRFS_I(inode)->disk_i_size;
632 640
633 /* truncate file */ 641 /* truncate file */
@@ -733,7 +741,7 @@ out:
733 */ 741 */
734 if (ordered) 742 if (ordered)
735 __btrfs_remove_ordered_extent(inode, ordered); 743 __btrfs_remove_ordered_extent(inode, ordered);
736 mutex_unlock(&tree->mutex); 744 spin_unlock(&tree->lock);
737 if (ordered) 745 if (ordered)
738 wake_up(&ordered->wait); 746 wake_up(&ordered->wait);
739 return ret; 747 return ret;
@@ -760,7 +768,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
760 if (!ordered) 768 if (!ordered)
761 return 1; 769 return 1;
762 770
763 mutex_lock(&tree->mutex); 771 spin_lock(&tree->lock);
764 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { 772 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
765 if (disk_bytenr >= ordered_sum->bytenr) { 773 if (disk_bytenr >= ordered_sum->bytenr) {
766 num_sectors = ordered_sum->len / sectorsize; 774 num_sectors = ordered_sum->len / sectorsize;
@@ -775,7 +783,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
775 } 783 }
776 } 784 }
777out: 785out:
778 mutex_unlock(&tree->mutex); 786 spin_unlock(&tree->lock);
779 btrfs_put_ordered_extent(ordered); 787 btrfs_put_ordered_extent(ordered);
780 return ret; 788 return ret;
781} 789}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 1fe1282ef47c..c82f76a9f040 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -21,7 +21,7 @@
21 21
22/* one of these per inode */ 22/* one of these per inode */
23struct btrfs_ordered_inode_tree { 23struct btrfs_ordered_inode_tree {
24 struct mutex mutex; 24 spinlock_t lock;
25 struct rb_root tree; 25 struct rb_root tree;
26 struct rb_node *last; 26 struct rb_node *last;
27}; 27};
@@ -128,8 +128,8 @@ static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
128static inline void 128static inline void
129btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) 129btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
130{ 130{
131 mutex_init(&t->mutex); 131 spin_lock_init(&t->lock);
132 t->tree.rb_node = NULL; 132 t->tree = RB_ROOT;
133 t->last = NULL; 133 t->last = NULL;
134} 134}
135 135
@@ -137,7 +137,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
137int btrfs_remove_ordered_extent(struct inode *inode, 137int btrfs_remove_ordered_extent(struct inode *inode,
138 struct btrfs_ordered_extent *entry); 138 struct btrfs_ordered_extent *entry);
139int btrfs_dec_test_ordered_pending(struct inode *inode, 139int btrfs_dec_test_ordered_pending(struct inode *inode,
140 u64 file_offset, u64 io_size); 140 struct btrfs_ordered_extent **cached,
141 u64 file_offset, u64 io_size);
141int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 142int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
142 u64 start, u64 len, u64 disk_len, int tyep); 143 u64 start, u64 len, u64 disk_len, int tyep);
143int btrfs_add_ordered_sum(struct inode *inode, 144int btrfs_add_ordered_sum(struct inode *inode,
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index d0cc62bccb94..a97314cf6bd6 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include <linux/sort.h> 21#include <linux/sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "ref-cache.h" 23#include "ref-cache.h"
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index bc283ad2db73..e2a55cb2072b 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -52,7 +52,7 @@ static inline size_t btrfs_leaf_ref_size(int nr_extents)
52 52
53static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree) 53static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
54{ 54{
55 tree->root.rb_node = NULL; 55 tree->root = RB_ROOT;
56 INIT_LIST_HEAD(&tree->list); 56 INIT_LIST_HEAD(&tree->list);
57 spin_lock_init(&tree->lock); 57 spin_lock_init(&tree->lock);
58} 58}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index a9728680eca8..e558dd941ded 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -21,6 +21,7 @@
21#include <linux/writeback.h> 21#include <linux/writeback.h>
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/rbtree.h> 23#include <linux/rbtree.h>
24#include <linux/slab.h>
24#include "ctree.h" 25#include "ctree.h"
25#include "disk-io.h" 26#include "disk-io.h"
26#include "transaction.h" 27#include "transaction.h"
@@ -170,14 +171,14 @@ struct async_merge {
170 171
171static void mapping_tree_init(struct mapping_tree *tree) 172static void mapping_tree_init(struct mapping_tree *tree)
172{ 173{
173 tree->rb_root.rb_node = NULL; 174 tree->rb_root = RB_ROOT;
174 spin_lock_init(&tree->lock); 175 spin_lock_init(&tree->lock);
175} 176}
176 177
177static void backref_cache_init(struct backref_cache *cache) 178static void backref_cache_init(struct backref_cache *cache)
178{ 179{
179 int i; 180 int i;
180 cache->rb_root.rb_node = NULL; 181 cache->rb_root = RB_ROOT;
181 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 182 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
182 INIT_LIST_HEAD(&cache->pending[i]); 183 INIT_LIST_HEAD(&cache->pending[i]);
183 spin_lock_init(&cache->lock); 184 spin_lock_init(&cache->lock);
@@ -2659,7 +2660,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2659 EXTENT_BOUNDARY, GFP_NOFS); 2660 EXTENT_BOUNDARY, GFP_NOFS);
2660 nr++; 2661 nr++;
2661 } 2662 }
2662 btrfs_set_extent_delalloc(inode, page_start, page_end); 2663 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2663 2664
2664 set_page_dirty(page); 2665 set_page_dirty(page);
2665 dirty_page++; 2666 dirty_page++;
@@ -3281,8 +3282,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3281 return -ENOMEM; 3282 return -ENOMEM;
3282 3283
3283 path = btrfs_alloc_path(); 3284 path = btrfs_alloc_path();
3284 if (!path) 3285 if (!path) {
3286 kfree(cluster);
3285 return -ENOMEM; 3287 return -ENOMEM;
3288 }
3286 3289
3287 rc->extents_found = 0; 3290 rc->extents_found = 0;
3288 rc->extents_skipped = 0; 3291 rc->extents_skipped = 0;
@@ -3485,7 +3488,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3485 key.objectid = objectid; 3488 key.objectid = objectid;
3486 key.type = BTRFS_INODE_ITEM_KEY; 3489 key.type = BTRFS_INODE_ITEM_KEY;
3487 key.offset = 0; 3490 key.offset = 0;
3488 inode = btrfs_iget(root->fs_info->sb, &key, root); 3491 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
3489 BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); 3492 BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
3490 BTRFS_I(inode)->index_cnt = group->key.objectid; 3493 BTRFS_I(inode)->index_cnt = group->key.objectid;
3491 3494
@@ -3762,7 +3765,8 @@ out:
3762 BTRFS_DATA_RELOC_TREE_OBJECTID); 3765 BTRFS_DATA_RELOC_TREE_OBJECTID);
3763 if (IS_ERR(fs_root)) 3766 if (IS_ERR(fs_root))
3764 err = PTR_ERR(fs_root); 3767 err = PTR_ERR(fs_root);
3765 btrfs_orphan_cleanup(fs_root); 3768 else
3769 btrfs_orphan_cleanup(fs_root);
3766 } 3770 }
3767 return err; 3771 return err;
3768} 3772}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 3f9b45704fcd..1866dff0538e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -38,6 +38,7 @@
38#include <linux/namei.h> 38#include <linux/namei.h>
39#include <linux/miscdevice.h> 39#include <linux/miscdevice.h>
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h>
41#include "compat.h" 42#include "compat.h"
42#include "ctree.h" 43#include "ctree.h"
43#include "disk-io.h" 44#include "disk-io.h"
@@ -63,25 +64,26 @@ static void btrfs_put_super(struct super_block *sb)
63} 64}
64 65
65enum { 66enum {
66 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 67 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
67 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 68 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
68 Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, 69 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
69 Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, 70 Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
70 Opt_discard, Opt_err, 71 Opt_discard, Opt_err,
71}; 72};
72 73
73static match_table_t tokens = { 74static match_table_t tokens = {
74 {Opt_degraded, "degraded"}, 75 {Opt_degraded, "degraded"},
75 {Opt_subvol, "subvol=%s"}, 76 {Opt_subvol, "subvol=%s"},
77 {Opt_subvolid, "subvolid=%d"},
76 {Opt_device, "device=%s"}, 78 {Opt_device, "device=%s"},
77 {Opt_nodatasum, "nodatasum"}, 79 {Opt_nodatasum, "nodatasum"},
78 {Opt_nodatacow, "nodatacow"}, 80 {Opt_nodatacow, "nodatacow"},
79 {Opt_nobarrier, "nobarrier"}, 81 {Opt_nobarrier, "nobarrier"},
80 {Opt_max_extent, "max_extent=%s"},
81 {Opt_max_inline, "max_inline=%s"}, 82 {Opt_max_inline, "max_inline=%s"},
82 {Opt_alloc_start, "alloc_start=%s"}, 83 {Opt_alloc_start, "alloc_start=%s"},
83 {Opt_thread_pool, "thread_pool=%d"}, 84 {Opt_thread_pool, "thread_pool=%d"},
84 {Opt_compress, "compress"}, 85 {Opt_compress, "compress"},
86 {Opt_compress_force, "compress-force"},
85 {Opt_ssd, "ssd"}, 87 {Opt_ssd, "ssd"},
86 {Opt_ssd_spread, "ssd_spread"}, 88 {Opt_ssd_spread, "ssd_spread"},
87 {Opt_nossd, "nossd"}, 89 {Opt_nossd, "nossd"},
@@ -93,31 +95,6 @@ static match_table_t tokens = {
93 {Opt_err, NULL}, 95 {Opt_err, NULL},
94}; 96};
95 97
96u64 btrfs_parse_size(char *str)
97{
98 u64 res;
99 int mult = 1;
100 char *end;
101 char last;
102
103 res = simple_strtoul(str, &end, 10);
104
105 last = end[0];
106 if (isalpha(last)) {
107 last = tolower(last);
108 switch (last) {
109 case 'g':
110 mult *= 1024;
111 case 'm':
112 mult *= 1024;
113 case 'k':
114 mult *= 1024;
115 }
116 res = res * mult;
117 }
118 return res;
119}
120
121/* 98/*
122 * Regular mount options parser. Everything that is needed only when 99 * Regular mount options parser. Everything that is needed only when
123 * reading in a new superblock is parsed here. 100 * reading in a new superblock is parsed here.
@@ -126,7 +103,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
126{ 103{
127 struct btrfs_fs_info *info = root->fs_info; 104 struct btrfs_fs_info *info = root->fs_info;
128 substring_t args[MAX_OPT_ARGS]; 105 substring_t args[MAX_OPT_ARGS];
129 char *p, *num; 106 char *p, *num, *orig;
130 int intarg; 107 int intarg;
131 int ret = 0; 108 int ret = 0;
132 109
@@ -141,6 +118,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
141 if (!options) 118 if (!options)
142 return -ENOMEM; 119 return -ENOMEM;
143 120
121 orig = options;
144 122
145 while ((p = strsep(&options, ",")) != NULL) { 123 while ((p = strsep(&options, ",")) != NULL) {
146 int token; 124 int token;
@@ -154,6 +132,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
154 btrfs_set_opt(info->mount_opt, DEGRADED); 132 btrfs_set_opt(info->mount_opt, DEGRADED);
155 break; 133 break;
156 case Opt_subvol: 134 case Opt_subvol:
135 case Opt_subvolid:
157 case Opt_device: 136 case Opt_device:
158 /* 137 /*
159 * These are parsed by btrfs_parse_early_options 138 * These are parsed by btrfs_parse_early_options
@@ -173,6 +152,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
173 printk(KERN_INFO "btrfs: use compression\n"); 152 printk(KERN_INFO "btrfs: use compression\n");
174 btrfs_set_opt(info->mount_opt, COMPRESS); 153 btrfs_set_opt(info->mount_opt, COMPRESS);
175 break; 154 break;
155 case Opt_compress_force:
156 printk(KERN_INFO "btrfs: forcing compression\n");
157 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
158 btrfs_set_opt(info->mount_opt, COMPRESS);
159 break;
176 case Opt_ssd: 160 case Opt_ssd:
177 printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); 161 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
178 btrfs_set_opt(info->mount_opt, SSD); 162 btrfs_set_opt(info->mount_opt, SSD);
@@ -203,22 +187,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
203 info->thread_pool_size); 187 info->thread_pool_size);
204 } 188 }
205 break; 189 break;
206 case Opt_max_extent:
207 num = match_strdup(&args[0]);
208 if (num) {
209 info->max_extent = btrfs_parse_size(num);
210 kfree(num);
211
212 info->max_extent = max_t(u64,
213 info->max_extent, root->sectorsize);
214 printk(KERN_INFO "btrfs: max_extent at %llu\n",
215 (unsigned long long)info->max_extent);
216 }
217 break;
218 case Opt_max_inline: 190 case Opt_max_inline:
219 num = match_strdup(&args[0]); 191 num = match_strdup(&args[0]);
220 if (num) { 192 if (num) {
221 info->max_inline = btrfs_parse_size(num); 193 info->max_inline = memparse(num, NULL);
222 kfree(num); 194 kfree(num);
223 195
224 if (info->max_inline) { 196 if (info->max_inline) {
@@ -233,7 +205,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
233 case Opt_alloc_start: 205 case Opt_alloc_start:
234 num = match_strdup(&args[0]); 206 num = match_strdup(&args[0]);
235 if (num) { 207 if (num) {
236 info->alloc_start = btrfs_parse_size(num); 208 info->alloc_start = memparse(num, NULL);
237 kfree(num); 209 kfree(num);
238 printk(KERN_INFO 210 printk(KERN_INFO
239 "btrfs: allocations start at %llu\n", 211 "btrfs: allocations start at %llu\n",
@@ -273,7 +245,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
273 } 245 }
274 } 246 }
275out: 247out:
276 kfree(options); 248 kfree(orig);
277 return ret; 249 return ret;
278} 250}
279 251
@@ -284,12 +256,13 @@ out:
284 * only when we need to allocate a new super block. 256 * only when we need to allocate a new super block.
285 */ 257 */
286static int btrfs_parse_early_options(const char *options, fmode_t flags, 258static int btrfs_parse_early_options(const char *options, fmode_t flags,
287 void *holder, char **subvol_name, 259 void *holder, char **subvol_name, u64 *subvol_objectid,
288 struct btrfs_fs_devices **fs_devices) 260 struct btrfs_fs_devices **fs_devices)
289{ 261{
290 substring_t args[MAX_OPT_ARGS]; 262 substring_t args[MAX_OPT_ARGS];
291 char *opts, *p; 263 char *opts, *p;
292 int error = 0; 264 int error = 0;
265 int intarg;
293 266
294 if (!options) 267 if (!options)
295 goto out; 268 goto out;
@@ -312,6 +285,18 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
312 case Opt_subvol: 285 case Opt_subvol:
313 *subvol_name = match_strdup(&args[0]); 286 *subvol_name = match_strdup(&args[0]);
314 break; 287 break;
288 case Opt_subvolid:
289 intarg = 0;
290 error = match_int(&args[0], &intarg);
291 if (!error) {
292 /* we want the original fs_tree */
293 if (!intarg)
294 *subvol_objectid =
295 BTRFS_FS_TREE_OBJECTID;
296 else
297 *subvol_objectid = intarg;
298 }
299 break;
315 case Opt_device: 300 case Opt_device:
316 error = btrfs_scan_one_device(match_strdup(&args[0]), 301 error = btrfs_scan_one_device(match_strdup(&args[0]),
317 flags, holder, fs_devices); 302 flags, holder, fs_devices);
@@ -339,6 +324,110 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
339 return error; 324 return error;
340} 325}
341 326
327static struct dentry *get_default_root(struct super_block *sb,
328 u64 subvol_objectid)
329{
330 struct btrfs_root *root = sb->s_fs_info;
331 struct btrfs_root *new_root;
332 struct btrfs_dir_item *di;
333 struct btrfs_path *path;
334 struct btrfs_key location;
335 struct inode *inode;
336 struct dentry *dentry;
337 u64 dir_id;
338 int new = 0;
339
340 /*
341 * We have a specific subvol we want to mount, just setup location and
342 * go look up the root.
343 */
344 if (subvol_objectid) {
345 location.objectid = subvol_objectid;
346 location.type = BTRFS_ROOT_ITEM_KEY;
347 location.offset = (u64)-1;
348 goto find_root;
349 }
350
351 path = btrfs_alloc_path();
352 if (!path)
353 return ERR_PTR(-ENOMEM);
354 path->leave_spinning = 1;
355
356 /*
357 * Find the "default" dir item which points to the root item that we
358 * will mount by default if we haven't been given a specific subvolume
359 * to mount.
360 */
361 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
362 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
363 if (!di) {
364 /*
365 * Ok the default dir item isn't there. This is weird since
366 * it's always been there, but don't freak out, just try and
367 * mount to root most subvolume.
368 */
369 btrfs_free_path(path);
370 dir_id = BTRFS_FIRST_FREE_OBJECTID;
371 new_root = root->fs_info->fs_root;
372 goto setup_root;
373 }
374
375 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
376 btrfs_free_path(path);
377
378find_root:
379 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
380 if (IS_ERR(new_root))
381 return ERR_PTR(PTR_ERR(new_root));
382
383 if (btrfs_root_refs(&new_root->root_item) == 0)
384 return ERR_PTR(-ENOENT);
385
386 dir_id = btrfs_root_dirid(&new_root->root_item);
387setup_root:
388 location.objectid = dir_id;
389 location.type = BTRFS_INODE_ITEM_KEY;
390 location.offset = 0;
391
392 inode = btrfs_iget(sb, &location, new_root, &new);
393 if (!inode)
394 return ERR_PTR(-ENOMEM);
395
396 /*
397 * If we're just mounting the root most subvol put the inode and return
398 * a reference to the dentry. We will have already gotten a reference
399 * to the inode in btrfs_fill_super so we're good to go.
400 */
401 if (!new && sb->s_root->d_inode == inode) {
402 iput(inode);
403 return dget(sb->s_root);
404 }
405
406 if (new) {
407 const struct qstr name = { .name = "/", .len = 1 };
408
409 /*
410 * New inode, we need to make the dentry a sibling of s_root so
411 * everything gets cleaned up properly on unmount.
412 */
413 dentry = d_alloc(sb->s_root, &name);
414 if (!dentry) {
415 iput(inode);
416 return ERR_PTR(-ENOMEM);
417 }
418 d_splice_alias(inode, dentry);
419 } else {
420 /*
421 * We found the inode in cache, just find a dentry for it and
422 * put the reference to the inode we just got.
423 */
424 dentry = d_find_alias(inode);
425 iput(inode);
426 }
427
428 return dentry;
429}
430
342static int btrfs_fill_super(struct super_block *sb, 431static int btrfs_fill_super(struct super_block *sb,
343 struct btrfs_fs_devices *fs_devices, 432 struct btrfs_fs_devices *fs_devices,
344 void *data, int silent) 433 void *data, int silent)
@@ -372,7 +461,7 @@ static int btrfs_fill_super(struct super_block *sb,
372 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 461 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
373 key.type = BTRFS_INODE_ITEM_KEY; 462 key.type = BTRFS_INODE_ITEM_KEY;
374 key.offset = 0; 463 key.offset = 0;
375 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root); 464 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL);
376 if (IS_ERR(inode)) { 465 if (IS_ERR(inode)) {
377 err = PTR_ERR(inode); 466 err = PTR_ERR(inode);
378 goto fail_close; 467 goto fail_close;
@@ -384,12 +473,6 @@ static int btrfs_fill_super(struct super_block *sb,
384 err = -ENOMEM; 473 err = -ENOMEM;
385 goto fail_close; 474 goto fail_close;
386 } 475 }
387#if 0
388 /* this does the super kobj at the same time */
389 err = btrfs_sysfs_add_super(tree_root->fs_info);
390 if (err)
391 goto fail_close;
392#endif
393 476
394 sb->s_root = root_dentry; 477 sb->s_root = root_dentry;
395 478
@@ -433,9 +516,6 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
433 seq_puts(seq, ",nodatacow"); 516 seq_puts(seq, ",nodatacow");
434 if (btrfs_test_opt(root, NOBARRIER)) 517 if (btrfs_test_opt(root, NOBARRIER))
435 seq_puts(seq, ",nobarrier"); 518 seq_puts(seq, ",nobarrier");
436 if (info->max_extent != (u64)-1)
437 seq_printf(seq, ",max_extent=%llu",
438 (unsigned long long)info->max_extent);
439 if (info->max_inline != 8192 * 1024) 519 if (info->max_inline != 8192 * 1024)
440 seq_printf(seq, ",max_inline=%llu", 520 seq_printf(seq, ",max_inline=%llu",
441 (unsigned long long)info->max_inline); 521 (unsigned long long)info->max_inline);
@@ -481,19 +561,22 @@ static int btrfs_test_super(struct super_block *s, void *data)
481static int btrfs_get_sb(struct file_system_type *fs_type, int flags, 561static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
482 const char *dev_name, void *data, struct vfsmount *mnt) 562 const char *dev_name, void *data, struct vfsmount *mnt)
483{ 563{
484 char *subvol_name = NULL;
485 struct block_device *bdev = NULL; 564 struct block_device *bdev = NULL;
486 struct super_block *s; 565 struct super_block *s;
487 struct dentry *root; 566 struct dentry *root;
488 struct btrfs_fs_devices *fs_devices = NULL; 567 struct btrfs_fs_devices *fs_devices = NULL;
489 fmode_t mode = FMODE_READ; 568 fmode_t mode = FMODE_READ;
569 char *subvol_name = NULL;
570 u64 subvol_objectid = 0;
490 int error = 0; 571 int error = 0;
572 int found = 0;
491 573
492 if (!(flags & MS_RDONLY)) 574 if (!(flags & MS_RDONLY))
493 mode |= FMODE_WRITE; 575 mode |= FMODE_WRITE;
494 576
495 error = btrfs_parse_early_options(data, mode, fs_type, 577 error = btrfs_parse_early_options(data, mode, fs_type,
496 &subvol_name, &fs_devices); 578 &subvol_name, &subvol_objectid,
579 &fs_devices);
497 if (error) 580 if (error)
498 return error; 581 return error;
499 582
@@ -522,6 +605,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
522 goto error_close_devices; 605 goto error_close_devices;
523 } 606 }
524 607
608 found = 1;
525 btrfs_close_devices(fs_devices); 609 btrfs_close_devices(fs_devices);
526 } else { 610 } else {
527 char b[BDEVNAME_SIZE]; 611 char b[BDEVNAME_SIZE];
@@ -539,25 +623,35 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
539 s->s_flags |= MS_ACTIVE; 623 s->s_flags |= MS_ACTIVE;
540 } 624 }
541 625
542 if (!strcmp(subvol_name, ".")) 626 root = get_default_root(s, subvol_objectid);
543 root = dget(s->s_root); 627 if (IS_ERR(root)) {
544 else { 628 error = PTR_ERR(root);
545 mutex_lock(&s->s_root->d_inode->i_mutex); 629 deactivate_locked_super(s);
546 root = lookup_one_len(subvol_name, s->s_root, 630 goto error;
631 }
632 /* if they gave us a subvolume name bind mount into that */
633 if (strcmp(subvol_name, ".")) {
634 struct dentry *new_root;
635 mutex_lock(&root->d_inode->i_mutex);
636 new_root = lookup_one_len(subvol_name, root,
547 strlen(subvol_name)); 637 strlen(subvol_name));
548 mutex_unlock(&s->s_root->d_inode->i_mutex); 638 mutex_unlock(&root->d_inode->i_mutex);
549 639
550 if (IS_ERR(root)) { 640 if (IS_ERR(new_root)) {
551 deactivate_locked_super(s); 641 deactivate_locked_super(s);
552 error = PTR_ERR(root); 642 error = PTR_ERR(new_root);
553 goto error_free_subvol_name; 643 dput(root);
644 goto error_close_devices;
554 } 645 }
555 if (!root->d_inode) { 646 if (!new_root->d_inode) {
556 dput(root); 647 dput(root);
648 dput(new_root);
557 deactivate_locked_super(s); 649 deactivate_locked_super(s);
558 error = -ENXIO; 650 error = -ENXIO;
559 goto error_free_subvol_name; 651 goto error_close_devices;
560 } 652 }
653 dput(root);
654 root = new_root;
561 } 655 }
562 656
563 mnt->mnt_sb = s; 657 mnt->mnt_sb = s;
@@ -572,6 +666,7 @@ error_close_devices:
572 btrfs_close_devices(fs_devices); 666 btrfs_close_devices(fs_devices);
573error_free_subvol_name: 667error_free_subvol_name:
574 kfree(subvol_name); 668 kfree(subvol_name);
669error:
575 return error; 670 return error;
576} 671}
577 672
@@ -616,14 +711,37 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
616{ 711{
617 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 712 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
618 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 713 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
714 struct list_head *head = &root->fs_info->space_info;
715 struct btrfs_space_info *found;
716 u64 total_used = 0;
717 u64 data_used = 0;
619 int bits = dentry->d_sb->s_blocksize_bits; 718 int bits = dentry->d_sb->s_blocksize_bits;
620 __be32 *fsid = (__be32 *)root->fs_info->fsid; 719 __be32 *fsid = (__be32 *)root->fs_info->fsid;
621 720
721 rcu_read_lock();
722 list_for_each_entry_rcu(found, head, list) {
723 if (found->flags & (BTRFS_BLOCK_GROUP_DUP|
724 BTRFS_BLOCK_GROUP_RAID10|
725 BTRFS_BLOCK_GROUP_RAID1)) {
726 total_used += found->bytes_used;
727 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
728 data_used += found->bytes_used;
729 else
730 data_used += found->total_bytes;
731 }
732
733 total_used += found->bytes_used;
734 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
735 data_used += found->bytes_used;
736 else
737 data_used += found->total_bytes;
738 }
739 rcu_read_unlock();
740
622 buf->f_namelen = BTRFS_NAME_LEN; 741 buf->f_namelen = BTRFS_NAME_LEN;
623 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 742 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
624 buf->f_bfree = buf->f_blocks - 743 buf->f_bfree = buf->f_blocks - (total_used >> bits);
625 (btrfs_super_bytes_used(disk_super) >> bits); 744 buf->f_bavail = buf->f_blocks - (data_used >> bits);
626 buf->f_bavail = buf->f_bfree;
627 buf->f_bsize = dentry->d_sb->s_blocksize; 745 buf->f_bsize = dentry->d_sb->s_blocksize;
628 buf->f_type = BTRFS_SUPER_MAGIC; 746 buf->f_type = BTRFS_SUPER_MAGIC;
629 747
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index a240b6fa81df..4ce16ef702a3 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -164,12 +164,12 @@ static void btrfs_root_release(struct kobject *kobj)
164 complete(&root->kobj_unregister); 164 complete(&root->kobj_unregister);
165} 165}
166 166
167static struct sysfs_ops btrfs_super_attr_ops = { 167static const struct sysfs_ops btrfs_super_attr_ops = {
168 .show = btrfs_super_attr_show, 168 .show = btrfs_super_attr_show,
169 .store = btrfs_super_attr_store, 169 .store = btrfs_super_attr_store,
170}; 170};
171 171
172static struct sysfs_ops btrfs_root_attr_ops = { 172static const struct sysfs_ops btrfs_root_attr_ops = {
173 .show = btrfs_root_attr_show, 173 .show = btrfs_root_attr_show,
174 .store = btrfs_root_attr_store, 174 .store = btrfs_root_attr_store,
175}; 175};
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b2acc79f1b34..2cb116099b90 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/slab.h>
20#include <linux/sched.h> 21#include <linux/sched.h>
21#include <linux/writeback.h> 22#include <linux/writeback.h>
22#include <linux/pagemap.h> 23#include <linux/pagemap.h>
@@ -69,7 +70,7 @@ static noinline int join_transaction(struct btrfs_root *root)
69 cur_trans->commit_done = 0; 70 cur_trans->commit_done = 0;
70 cur_trans->start_time = get_seconds(); 71 cur_trans->start_time = get_seconds();
71 72
72 cur_trans->delayed_refs.root.rb_node = NULL; 73 cur_trans->delayed_refs.root = RB_ROOT;
73 cur_trans->delayed_refs.num_entries = 0; 74 cur_trans->delayed_refs.num_entries = 0;
74 cur_trans->delayed_refs.num_heads_ready = 0; 75 cur_trans->delayed_refs.num_heads_ready = 0;
75 cur_trans->delayed_refs.num_heads = 0; 76 cur_trans->delayed_refs.num_heads = 0;
@@ -147,18 +148,13 @@ static void wait_current_trans(struct btrfs_root *root)
147 while (1) { 148 while (1) {
148 prepare_to_wait(&root->fs_info->transaction_wait, &wait, 149 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
149 TASK_UNINTERRUPTIBLE); 150 TASK_UNINTERRUPTIBLE);
150 if (cur_trans->blocked) { 151 if (!cur_trans->blocked)
151 mutex_unlock(&root->fs_info->trans_mutex);
152 schedule();
153 mutex_lock(&root->fs_info->trans_mutex);
154 finish_wait(&root->fs_info->transaction_wait,
155 &wait);
156 } else {
157 finish_wait(&root->fs_info->transaction_wait,
158 &wait);
159 break; 152 break;
160 } 153 mutex_unlock(&root->fs_info->trans_mutex);
154 schedule();
155 mutex_lock(&root->fs_info->trans_mutex);
161 } 156 }
157 finish_wait(&root->fs_info->transaction_wait, &wait);
162 put_transaction(cur_trans); 158 put_transaction(cur_trans);
163 } 159 }
164} 160}
@@ -760,10 +756,17 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
760 struct btrfs_root_item *new_root_item; 756 struct btrfs_root_item *new_root_item;
761 struct btrfs_root *tree_root = fs_info->tree_root; 757 struct btrfs_root *tree_root = fs_info->tree_root;
762 struct btrfs_root *root = pending->root; 758 struct btrfs_root *root = pending->root;
759 struct btrfs_root *parent_root;
760 struct inode *parent_inode;
763 struct extent_buffer *tmp; 761 struct extent_buffer *tmp;
764 struct extent_buffer *old; 762 struct extent_buffer *old;
765 int ret; 763 int ret;
766 u64 objectid; 764 u64 objectid;
765 int namelen;
766 u64 index = 0;
767
768 parent_inode = pending->dentry->d_parent->d_inode;
769 parent_root = BTRFS_I(parent_inode)->root;
767 770
768 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 771 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
769 if (!new_root_item) { 772 if (!new_root_item) {
@@ -774,79 +777,59 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
774 if (ret) 777 if (ret)
775 goto fail; 778 goto fail;
776 779
777 record_root_in_trans(trans, root);
778 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
779 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
780
781 key.objectid = objectid; 780 key.objectid = objectid;
782 /* record when the snapshot was created in key.offset */ 781 /* record when the snapshot was created in key.offset */
783 key.offset = trans->transid; 782 key.offset = trans->transid;
784 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 783 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
785 784
786 old = btrfs_lock_root_node(root);
787 btrfs_cow_block(trans, root, old, NULL, 0, &old);
788 btrfs_set_lock_blocking(old);
789
790 btrfs_copy_root(trans, root, old, &tmp, objectid);
791 btrfs_tree_unlock(old);
792 free_extent_buffer(old);
793
794 btrfs_set_root_node(new_root_item, tmp);
795 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
796 new_root_item);
797 btrfs_tree_unlock(tmp);
798 free_extent_buffer(tmp);
799 if (ret)
800 goto fail;
801
802 key.offset = (u64)-1;
803 memcpy(&pending->root_key, &key, sizeof(key)); 785 memcpy(&pending->root_key, &key, sizeof(key));
804fail: 786 pending->root_key.offset = (u64)-1;
805 kfree(new_root_item);
806 return ret;
807}
808
809static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
810 struct btrfs_pending_snapshot *pending)
811{
812 int ret;
813 int namelen;
814 u64 index = 0;
815 struct btrfs_trans_handle *trans;
816 struct inode *parent_inode;
817 struct btrfs_root *parent_root;
818
819 parent_inode = pending->dentry->d_parent->d_inode;
820 parent_root = BTRFS_I(parent_inode)->root;
821 trans = btrfs_join_transaction(parent_root, 1);
822 787
788 record_root_in_trans(trans, parent_root);
823 /* 789 /*
824 * insert the directory item 790 * insert the directory item
825 */ 791 */
826 namelen = strlen(pending->name); 792 namelen = strlen(pending->name);
827 ret = btrfs_set_inode_index(parent_inode, &index); 793 ret = btrfs_set_inode_index(parent_inode, &index);
794 BUG_ON(ret);
828 ret = btrfs_insert_dir_item(trans, parent_root, 795 ret = btrfs_insert_dir_item(trans, parent_root,
829 pending->name, namelen, 796 pending->name, namelen,
830 parent_inode->i_ino, 797 parent_inode->i_ino,
831 &pending->root_key, BTRFS_FT_DIR, index); 798 &pending->root_key, BTRFS_FT_DIR, index);
832 799 BUG_ON(ret);
833 if (ret)
834 goto fail;
835 800
836 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2); 801 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
837 ret = btrfs_update_inode(trans, parent_root, parent_inode); 802 ret = btrfs_update_inode(trans, parent_root, parent_inode);
838 BUG_ON(ret); 803 BUG_ON(ret);
839 804
805 record_root_in_trans(trans, root);
806 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
807 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
808
809 old = btrfs_lock_root_node(root);
810 btrfs_cow_block(trans, root, old, NULL, 0, &old);
811 btrfs_set_lock_blocking(old);
812
813 btrfs_copy_root(trans, root, old, &tmp, objectid);
814 btrfs_tree_unlock(old);
815 free_extent_buffer(old);
816
817 btrfs_set_root_node(new_root_item, tmp);
818 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
819 new_root_item);
820 BUG_ON(ret);
821 btrfs_tree_unlock(tmp);
822 free_extent_buffer(tmp);
823
840 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, 824 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
841 pending->root_key.objectid, 825 pending->root_key.objectid,
842 parent_root->root_key.objectid, 826 parent_root->root_key.objectid,
843 parent_inode->i_ino, index, pending->name, 827 parent_inode->i_ino, index, pending->name,
844 namelen); 828 namelen);
845
846 BUG_ON(ret); 829 BUG_ON(ret);
847 830
848fail: 831fail:
849 btrfs_end_transaction(trans, fs_info->fs_root); 832 kfree(new_root_item);
850 return ret; 833 return ret;
851} 834}
852 835
@@ -867,25 +850,6 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
867 return 0; 850 return 0;
868} 851}
869 852
870static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
871 struct btrfs_fs_info *fs_info)
872{
873 struct btrfs_pending_snapshot *pending;
874 struct list_head *head = &trans->transaction->pending_snapshots;
875 int ret;
876
877 while (!list_empty(head)) {
878 pending = list_entry(head->next,
879 struct btrfs_pending_snapshot, list);
880 ret = finish_pending_snapshot(fs_info, pending);
881 BUG_ON(ret);
882 list_del(&pending->list);
883 kfree(pending->name);
884 kfree(pending);
885 }
886 return 0;
887}
888
889static void update_super_roots(struct btrfs_root *root) 853static void update_super_roots(struct btrfs_root *root)
890{ 854{
891 struct btrfs_root_item *root_item; 855 struct btrfs_root_item *root_item;
@@ -997,13 +961,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
997 961
998 mutex_unlock(&root->fs_info->trans_mutex); 962 mutex_unlock(&root->fs_info->trans_mutex);
999 963
1000 if (flush_on_commit) { 964 if (flush_on_commit || snap_pending) {
1001 btrfs_start_delalloc_inodes(root, 1); 965 btrfs_start_delalloc_inodes(root, 1);
1002 ret = btrfs_wait_ordered_extents(root, 0, 1); 966 ret = btrfs_wait_ordered_extents(root, 0, 1);
1003 BUG_ON(ret); 967 BUG_ON(ret);
1004 } else if (snap_pending) {
1005 ret = btrfs_wait_ordered_extents(root, 0, 1);
1006 BUG_ON(ret);
1007 } 968 }
1008 969
1009 /* 970 /*
@@ -1100,9 +1061,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1100 1061
1101 btrfs_finish_extent_commit(trans, root); 1062 btrfs_finish_extent_commit(trans, root);
1102 1063
1103 /* do the directory inserts of any pending snapshot creations */
1104 finish_pending_snapshots(trans, root->fs_info);
1105
1106 mutex_lock(&root->fs_info->trans_mutex); 1064 mutex_lock(&root->fs_info->trans_mutex);
1107 1065
1108 cur_trans->commit_done = 1; 1066 cur_trans->commit_done = 1;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4a9434b622ec..af57dd2b43d4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "transaction.h" 22#include "transaction.h"
22#include "disk-io.h" 23#include "disk-io.h"
@@ -445,7 +446,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root,
445 key.objectid = objectid; 446 key.objectid = objectid;
446 key.type = BTRFS_INODE_ITEM_KEY; 447 key.type = BTRFS_INODE_ITEM_KEY;
447 key.offset = 0; 448 key.offset = 0;
448 inode = btrfs_iget(root->fs_info->sb, &key, root); 449 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
449 if (IS_ERR(inode)) { 450 if (IS_ERR(inode)) {
450 inode = NULL; 451 inode = NULL;
451 } else if (is_bad_inode(inode)) { 452 } else if (is_bad_inode(inode)) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 198cff28766d..8db7b14bbae8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/bio.h> 19#include <linux/bio.h>
20#include <linux/slab.h>
20#include <linux/buffer_head.h> 21#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 22#include <linux/blkdev.h>
22#include <linux/random.h> 23#include <linux/random.h>
@@ -256,13 +257,13 @@ loop_lock:
256 wake_up(&fs_info->async_submit_wait); 257 wake_up(&fs_info->async_submit_wait);
257 258
258 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 259 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
259 submit_bio(cur->bi_rw, cur);
260 num_run++;
261 batch_run++;
262 260
263 if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) 261 if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
264 num_sync_run++; 262 num_sync_run++;
265 263
264 submit_bio(cur->bi_rw, cur);
265 num_run++;
266 batch_run++;
266 if (need_resched()) { 267 if (need_resched()) {
267 if (num_sync_run) { 268 if (num_sync_run) {
268 blk_run_backing_dev(bdi, NULL); 269 blk_run_backing_dev(bdi, NULL);
@@ -325,16 +326,6 @@ loop_lock:
325 num_sync_run = 0; 326 num_sync_run = 0;
326 blk_run_backing_dev(bdi, NULL); 327 blk_run_backing_dev(bdi, NULL);
327 } 328 }
328
329 cond_resched();
330 if (again)
331 goto loop;
332
333 spin_lock(&device->io_lock);
334 if (device->pending_bios.head || device->pending_sync_bios.head)
335 goto loop_lock;
336 spin_unlock(&device->io_lock);
337
338 /* 329 /*
339 * IO has already been through a long path to get here. Checksumming, 330 * IO has already been through a long path to get here. Checksumming,
340 * async helper threads, perhaps compression. We've done a pretty 331 * async helper threads, perhaps compression. We've done a pretty
@@ -346,6 +337,16 @@ loop_lock:
346 * cared about found its way down here. 337 * cared about found its way down here.
347 */ 338 */
348 blk_run_backing_dev(bdi, NULL); 339 blk_run_backing_dev(bdi, NULL);
340
341 cond_resched();
342 if (again)
343 goto loop;
344
345 spin_lock(&device->io_lock);
346 if (device->pending_bios.head || device->pending_sync_bios.head)
347 goto loop_lock;
348 spin_unlock(&device->io_lock);
349
349done: 350done:
350 return 0; 351 return 0;
351} 352}
@@ -365,6 +366,7 @@ static noinline int device_list_add(const char *path,
365 struct btrfs_device *device; 366 struct btrfs_device *device;
366 struct btrfs_fs_devices *fs_devices; 367 struct btrfs_fs_devices *fs_devices;
367 u64 found_transid = btrfs_super_generation(disk_super); 368 u64 found_transid = btrfs_super_generation(disk_super);
369 char *name;
368 370
369 fs_devices = find_fsid(disk_super->fsid); 371 fs_devices = find_fsid(disk_super->fsid);
370 if (!fs_devices) { 372 if (!fs_devices) {
@@ -411,6 +413,12 @@ static noinline int device_list_add(const char *path,
411 413
412 device->fs_devices = fs_devices; 414 device->fs_devices = fs_devices;
413 fs_devices->num_devices++; 415 fs_devices->num_devices++;
416 } else if (strcmp(device->name, path)) {
417 name = kstrdup(path, GFP_NOFS);
418 if (!name)
419 return -ENOMEM;
420 kfree(device->name);
421 device->name = name;
414 } 422 }
415 423
416 if (found_transid > fs_devices->latest_trans) { 424 if (found_transid > fs_devices->latest_trans) {
@@ -592,7 +600,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
592 goto error_close; 600 goto error_close;
593 601
594 disk_super = (struct btrfs_super_block *)bh->b_data; 602 disk_super = (struct btrfs_super_block *)bh->b_data;
595 devid = le64_to_cpu(disk_super->dev_item.devid); 603 devid = btrfs_stack_device_id(&disk_super->dev_item);
596 if (devid != device->devid) 604 if (devid != device->devid)
597 goto error_brelse; 605 goto error_brelse;
598 606
@@ -694,7 +702,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
694 goto error_close; 702 goto error_close;
695 } 703 }
696 disk_super = (struct btrfs_super_block *)bh->b_data; 704 disk_super = (struct btrfs_super_block *)bh->b_data;
697 devid = le64_to_cpu(disk_super->dev_item.devid); 705 devid = btrfs_stack_device_id(&disk_super->dev_item);
698 transid = btrfs_super_generation(disk_super); 706 transid = btrfs_super_generation(disk_super);
699 if (disk_super->label[0]) 707 if (disk_super->label[0])
700 printk(KERN_INFO "device label %s ", disk_super->label); 708 printk(KERN_INFO "device label %s ", disk_super->label);
@@ -1135,7 +1143,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1135 root->fs_info->avail_metadata_alloc_bits; 1143 root->fs_info->avail_metadata_alloc_bits;
1136 1144
1137 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && 1145 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
1138 root->fs_info->fs_devices->rw_devices <= 4) { 1146 root->fs_info->fs_devices->num_devices <= 4) {
1139 printk(KERN_ERR "btrfs: unable to go below four devices " 1147 printk(KERN_ERR "btrfs: unable to go below four devices "
1140 "on raid10\n"); 1148 "on raid10\n");
1141 ret = -EINVAL; 1149 ret = -EINVAL;
@@ -1143,7 +1151,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1143 } 1151 }
1144 1152
1145 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && 1153 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
1146 root->fs_info->fs_devices->rw_devices <= 2) { 1154 root->fs_info->fs_devices->num_devices <= 2) {
1147 printk(KERN_ERR "btrfs: unable to go below two " 1155 printk(KERN_ERR "btrfs: unable to go below two "
1148 "devices on raid1\n"); 1156 "devices on raid1\n");
1149 ret = -EINVAL; 1157 ret = -EINVAL;
@@ -1187,7 +1195,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1187 goto error_close; 1195 goto error_close;
1188 } 1196 }
1189 disk_super = (struct btrfs_super_block *)bh->b_data; 1197 disk_super = (struct btrfs_super_block *)bh->b_data;
1190 devid = le64_to_cpu(disk_super->dev_item.devid); 1198 devid = btrfs_stack_device_id(&disk_super->dev_item);
1191 dev_uuid = disk_super->dev_item.uuid; 1199 dev_uuid = disk_super->dev_item.uuid;
1192 device = btrfs_find_device(root, devid, dev_uuid, 1200 device = btrfs_find_device(root, devid, dev_uuid,
1193 disk_super->fsid); 1201 disk_super->fsid);
@@ -1434,8 +1442,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1434 return -EINVAL; 1442 return -EINVAL;
1435 1443
1436 bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); 1444 bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
1437 if (!bdev) 1445 if (IS_ERR(bdev))
1438 return -EIO; 1446 return PTR_ERR(bdev);
1439 1447
1440 if (root->fs_info->fs_devices->seeding) { 1448 if (root->fs_info->fs_devices->seeding) {
1441 seeding_dev = 1; 1449 seeding_dev = 1;
@@ -2191,9 +2199,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2191 min_stripes = 2; 2199 min_stripes = 2;
2192 } 2200 }
2193 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2201 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2194 num_stripes = min_t(u64, 2, fs_devices->rw_devices); 2202 if (fs_devices->rw_devices < 2)
2195 if (num_stripes < 2)
2196 return -ENOSPC; 2203 return -ENOSPC;
2204 num_stripes = 2;
2197 min_stripes = 2; 2205 min_stripes = 2;
2198 } 2206 }
2199 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2207 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
@@ -2237,8 +2245,16 @@ again:
2237 do_div(calc_size, stripe_len); 2245 do_div(calc_size, stripe_len);
2238 calc_size *= stripe_len; 2246 calc_size *= stripe_len;
2239 } 2247 }
2248
2240 /* we don't want tiny stripes */ 2249 /* we don't want tiny stripes */
2241 calc_size = max_t(u64, min_stripe_size, calc_size); 2250 if (!looped)
2251 calc_size = max_t(u64, min_stripe_size, calc_size);
2252
2253 /*
2254 * we're about to do_div by the stripe_len so lets make sure
2255 * we end up with something bigger than a stripe
2256 */
2257 calc_size = max_t(u64, calc_size, stripe_len * 4);
2242 2258
2243 do_div(calc_size, stripe_len); 2259 do_div(calc_size, stripe_len);
2244 calc_size *= stripe_len; 2260 calc_size *= stripe_len;
@@ -2538,6 +2554,11 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
2538 if (!em) 2554 if (!em)
2539 return 1; 2555 return 1;
2540 2556
2557 if (btrfs_test_opt(root, DEGRADED)) {
2558 free_extent_map(em);
2559 return 0;
2560 }
2561
2541 map = (struct map_lookup *)em->bdev; 2562 map = (struct map_lookup *)em->bdev;
2542 for (i = 0; i < map->num_stripes; i++) { 2563 for (i = 0; i < map->num_stripes; i++) {
2543 if (!map->stripes[i].dev->writeable) { 2564 if (!map->stripes[i].dev->writeable) {
@@ -2649,8 +2670,10 @@ again:
2649 em = lookup_extent_mapping(em_tree, logical, *length); 2670 em = lookup_extent_mapping(em_tree, logical, *length);
2650 read_unlock(&em_tree->lock); 2671 read_unlock(&em_tree->lock);
2651 2672
2652 if (!em && unplug_page) 2673 if (!em && unplug_page) {
2674 kfree(multi);
2653 return 0; 2675 return 0;
2676 }
2654 2677
2655 if (!em) { 2678 if (!em) {
2656 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 2679 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
@@ -3375,6 +3398,8 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
3375 key.type = 0; 3398 key.type = 0;
3376again: 3399again:
3377 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3400 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3401 if (ret < 0)
3402 goto error;
3378 while (1) { 3403 while (1) {
3379 leaf = path->nodes[0]; 3404 leaf = path->nodes[0];
3380 slot = path->slots[0]; 3405 slot = path->slots[0];
diff --git a/fs/buffer.c b/fs/buffer.c
index 6fa530256bfd..c9c266db0624 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2893,7 +2893,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2893 2893
2894 /* 2894 /*
2895 * The page straddles i_size. It must be zeroed out on each and every 2895 * The page straddles i_size. It must be zeroed out on each and every
2896 * writepage invokation because it may be mmapped. "A file is mapped 2896 * writepage invocation because it may be mmapped. "A file is mapped
2897 * in multiples of the page size. For a file that is not a multiple of 2897 * in multiples of the page size. For a file that is not a multiple of
2898 * the page size, the remaining memory is zeroed when mapped, and 2898 * the page size, the remaining memory is zeroed when mapped, and
2899 * writes to that region are not written out to the file." 2899 * writes to that region are not written out to the file."
@@ -3265,7 +3265,7 @@ static void recalc_bh_state(void)
3265 3265
3266struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) 3266struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3267{ 3267{
3268 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); 3268 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3269 if (ret) { 3269 if (ret) {
3270 INIT_LIST_HEAD(&ret->b_assoc_buffers); 3270 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3271 get_cpu_var(bh_accounting).nr++; 3271 get_cpu_var(bh_accounting).nr++;
@@ -3352,15 +3352,6 @@ int bh_submit_read(struct buffer_head *bh)
3352} 3352}
3353EXPORT_SYMBOL(bh_submit_read); 3353EXPORT_SYMBOL(bh_submit_read);
3354 3354
3355static void
3356init_buffer_head(void *data)
3357{
3358 struct buffer_head *bh = data;
3359
3360 memset(bh, 0, sizeof(*bh));
3361 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3362}
3363
3364void __init buffer_init(void) 3355void __init buffer_init(void)
3365{ 3356{
3366 int nrpages; 3357 int nrpages;
@@ -3369,7 +3360,7 @@ void __init buffer_init(void)
3369 sizeof(struct buffer_head), 0, 3360 sizeof(struct buffer_head), 0,
3370 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| 3361 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3371 SLAB_MEM_SPREAD), 3362 SLAB_MEM_SPREAD),
3372 init_buffer_head); 3363 NULL);
3373 3364
3374 /* 3365 /*
3375 * Limit the bh occupancy to 10% of ZONE_NORMAL 3366 * Limit the bh occupancy to 10% of ZONE_NORMAL
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 27089311fbea..37fe101a4e0d 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -9,6 +9,7 @@
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/slab.h>
12#include <linux/mount.h> 13#include <linux/mount.h>
13#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
14#include "internal.h" 15#include "internal.h"
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 14ac4806e291..d5db84a1ee0d 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -19,6 +19,7 @@
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/namei.h> 20#include <linux/namei.h>
21#include <linux/security.h> 21#include <linux/security.h>
22#include <linux/slab.h>
22#include "internal.h" 23#include "internal.h"
23 24
24#define CACHEFILES_KEYBUF_SIZE 512 25#define CACHEFILES_KEYBUF_SIZE 512
@@ -348,7 +349,17 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
348 dir = dget_parent(object->dentry); 349 dir = dget_parent(object->dentry);
349 350
350 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); 351 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
351 ret = cachefiles_bury_object(cache, dir, object->dentry); 352
353 /* we need to check that our parent is _still_ our parent - it may have
354 * been renamed */
355 if (dir == object->dentry->d_parent) {
356 ret = cachefiles_bury_object(cache, dir, object->dentry);
357 } else {
358 /* it got moved, presumably by cachefilesd culling it, so it's
359 * no longer in the key path and we can ignore it */
360 mutex_unlock(&dir->d_inode->i_mutex);
361 ret = 0;
362 }
352 363
353 dput(dir); 364 dput(dir);
354 _leave(" = %d", ret); 365 _leave(" = %d", ret);
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 1d8332563863..0f0d41fbb03f 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -10,6 +10,7 @@
10 */ 10 */
11 11
12#include <linux/mount.h> 12#include <linux/mount.h>
13#include <linux/slab.h>
13#include <linux/file.h> 14#include <linux/file.h>
14#include "internal.h" 15#include "internal.h"
15 16
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index f3e7a0bf068b..e18b183b47e1 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -16,6 +16,7 @@
16#include <linux/fsnotify.h> 16#include <linux/fsnotify.h>
17#include <linux/quotaops.h> 17#include <linux/quotaops.h>
18#include <linux/xattr.h> 18#include <linux/xattr.h>
19#include <linux/slab.h>
19#include "internal.h" 20#include "internal.h"
20 21
21static const char cachefiles_xattr_cache[] = 22static const char cachefiles_xattr_cache[] =
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
new file mode 100644
index 000000000000..04b8280582a9
--- /dev/null
+++ b/fs/ceph/Kconfig
@@ -0,0 +1,27 @@
1config CEPH_FS
2 tristate "Ceph distributed file system (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL
4 select LIBCRC32C
5 select CONFIG_CRYPTO_AES
6 help
7 Choose Y or M here to include support for mounting the
8 experimental Ceph distributed file system. Ceph is an extremely
9 scalable file system designed to provide high performance,
10 reliable access to petabytes of storage.
11
12 More information at http://ceph.newdream.net/.
13
14 If unsure, say N.
15
16config CEPH_FS_PRETTYDEBUG
17 bool "Include file:line in ceph debug output"
18 depends on CEPH_FS
19 default n
20 help
21 If you say Y here, debug output will include a filename and
22 line to aid debugging. This increases kernel size and slows
23 execution slightly when debug call sites are enabled (e.g.,
24 via CONFIG_DYNAMIC_DEBUG).
25
26 If unsure, say N.
27
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
new file mode 100644
index 000000000000..6a660e610be8
--- /dev/null
+++ b/fs/ceph/Makefile
@@ -0,0 +1,39 @@
1#
2# Makefile for CEPH filesystem.
3#
4
5ifneq ($(KERNELRELEASE),)
6
7obj-$(CONFIG_CEPH_FS) += ceph.o
8
9ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \
11 messenger.o msgpool.o buffer.o pagelist.o \
12 mds_client.o mdsmap.o \
13 mon_client.o \
14 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
15 debugfs.o \
16 auth.o auth_none.o \
17 crypto.o armor.o \
18 auth_x.o \
19 ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
20
21else
22#Otherwise we were called directly from the command
23# line; invoke the kernel build system.
24
25KERNELDIR ?= /lib/modules/$(shell uname -r)/build
26PWD := $(shell pwd)
27
28default: all
29
30all:
31 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
32
33modules_install:
34 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
35
36clean:
37 $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
38
39endif
diff --git a/fs/ceph/README b/fs/ceph/README
new file mode 100644
index 000000000000..18352fab37c0
--- /dev/null
+++ b/fs/ceph/README
@@ -0,0 +1,20 @@
1#
2# The following files are shared by (and manually synchronized
3# between) the Ceph userland and kernel client.
4#
5# userland kernel
6src/include/ceph_fs.h fs/ceph/ceph_fs.h
7src/include/ceph_fs.cc fs/ceph/ceph_fs.c
8src/include/msgr.h fs/ceph/msgr.h
9src/include/rados.h fs/ceph/rados.h
10src/include/ceph_strings.cc fs/ceph/ceph_strings.c
11src/include/ceph_frag.h fs/ceph/ceph_frag.h
12src/include/ceph_frag.cc fs/ceph/ceph_frag.c
13src/include/ceph_hash.h fs/ceph/ceph_hash.h
14src/include/ceph_hash.cc fs/ceph/ceph_hash.c
15src/crush/crush.c fs/ceph/crush/crush.c
16src/crush/crush.h fs/ceph/crush/crush.h
17src/crush/mapper.c fs/ceph/crush/mapper.c
18src/crush/mapper.h fs/ceph/crush/mapper.h
19src/crush/hash.h fs/ceph/crush/hash.h
20src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
new file mode 100644
index 000000000000..412593703d1e
--- /dev/null
+++ b/fs/ceph/addr.c
@@ -0,0 +1,1193 @@
1#include "ceph_debug.h"
2
3#include <linux/backing-dev.h>
4#include <linux/fs.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/writeback.h> /* generic_writepages */
8#include <linux/slab.h>
9#include <linux/pagevec.h>
10#include <linux/task_io_accounting_ops.h>
11
12#include "super.h"
13#include "osd_client.h"
14
15/*
16 * Ceph address space ops.
17 *
18 * There are a few funny things going on here.
19 *
20 * The page->private field is used to reference a struct
21 * ceph_snap_context for _every_ dirty page. This indicates which
22 * snapshot the page was logically dirtied in, and thus which snap
23 * context needs to be associated with the osd write during writeback.
24 *
25 * Similarly, struct ceph_inode_info maintains a set of counters to
26 * count dirty pages on the inode. In the absence of snapshots,
27 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
28 *
29 * When a snapshot is taken (that is, when the client receives
30 * notification that a snapshot was taken), each inode with caps and
31 * with dirty pages (dirty pages implies there is a cap) gets a new
32 * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
33 * order, new snaps go to the tail). The i_wrbuffer_ref_head count is
34 * moved to capsnap->dirty. (Unless a sync write is currently in
35 * progress. In that case, the capsnap is said to be "pending", new
36 * writes cannot start, and the capsnap isn't "finalized" until the
37 * write completes (or fails) and a final size/mtime for the inode for
38 * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
39 *
40 * On writeback, we must submit writes to the osd IN SNAP ORDER. So,
41 * we look for the first capsnap in i_cap_snaps and write out pages in
42 * that snap context _only_. Then we move on to the next capsnap,
43 * eventually reaching the "live" or "head" context (i.e., pages that
44 * are not yet snapped) and are writing the most recently dirtied
45 * pages.
46 *
47 * Invalidate and so forth must take care to ensure the dirty page
48 * accounting is preserved.
49 */
50
51#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
52#define CONGESTION_OFF_THRESH(congestion_kb) \
53 (CONGESTION_ON_THRESH(congestion_kb) - \
54 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
55
56
57
58/*
59 * Dirty a page. Optimistically adjust accounting, on the assumption
60 * that we won't race with invalidate. If we do, readjust.
61 */
62static int ceph_set_page_dirty(struct page *page)
63{
64 struct address_space *mapping = page->mapping;
65 struct inode *inode;
66 struct ceph_inode_info *ci;
67 int undo = 0;
68 struct ceph_snap_context *snapc;
69
70 if (unlikely(!mapping))
71 return !TestSetPageDirty(page);
72
73 if (TestSetPageDirty(page)) {
74 dout("%p set_page_dirty %p idx %lu -- already dirty\n",
75 mapping->host, page, page->index);
76 return 0;
77 }
78
79 inode = mapping->host;
80 ci = ceph_inode(inode);
81
82 /*
83 * Note that we're grabbing a snapc ref here without holding
84 * any locks!
85 */
86 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
87
88 /* dirty the head */
89 spin_lock(&inode->i_lock);
90 if (ci->i_wrbuffer_ref_head == 0)
91 ci->i_head_snapc = ceph_get_snap_context(snapc);
92 ++ci->i_wrbuffer_ref_head;
93 if (ci->i_wrbuffer_ref == 0)
94 igrab(inode);
95 ++ci->i_wrbuffer_ref;
96 dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
97 "snapc %p seq %lld (%d snaps)\n",
98 mapping->host, page, page->index,
99 ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
100 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
101 snapc, snapc->seq, snapc->num_snaps);
102 spin_unlock(&inode->i_lock);
103
104 /* now adjust page */
105 spin_lock_irq(&mapping->tree_lock);
106 if (page->mapping) { /* Race with truncate? */
107 WARN_ON_ONCE(!PageUptodate(page));
108
109 if (mapping_cap_account_dirty(mapping)) {
110 __inc_zone_page_state(page, NR_FILE_DIRTY);
111 __inc_bdi_stat(mapping->backing_dev_info,
112 BDI_RECLAIMABLE);
113 task_io_account_write(PAGE_CACHE_SIZE);
114 }
115 radix_tree_tag_set(&mapping->page_tree,
116 page_index(page), PAGECACHE_TAG_DIRTY);
117
118 /*
119 * Reference snap context in page->private. Also set
120 * PagePrivate so that we get invalidatepage callback.
121 */
122 page->private = (unsigned long)snapc;
123 SetPagePrivate(page);
124 } else {
125 dout("ANON set_page_dirty %p (raced truncate?)\n", page);
126 undo = 1;
127 }
128
129 spin_unlock_irq(&mapping->tree_lock);
130
131 if (undo)
132 /* whoops, we failed to dirty the page */
133 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
134
135 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
136
137 BUG_ON(!PageDirty(page));
138 return 1;
139}
140
141/*
142 * If we are truncating the full page (i.e. offset == 0), adjust the
143 * dirty page counters appropriately. Only called if there is private
144 * data on the page.
145 */
146static void ceph_invalidatepage(struct page *page, unsigned long offset)
147{
148 struct inode *inode;
149 struct ceph_inode_info *ci;
150 struct ceph_snap_context *snapc = (void *)page->private;
151
152 BUG_ON(!PageLocked(page));
153 BUG_ON(!page->private);
154 BUG_ON(!PagePrivate(page));
155 BUG_ON(!page->mapping);
156
157 inode = page->mapping->host;
158
159 /*
160 * We can get non-dirty pages here due to races between
161 * set_page_dirty and truncate_complete_page; just spit out a
162 * warning, in case we end up with accounting problems later.
163 */
164 if (!PageDirty(page))
165 pr_err("%p invalidatepage %p page not dirty\n", inode, page);
166
167 if (offset == 0)
168 ClearPageChecked(page);
169
170 ci = ceph_inode(inode);
171 if (offset == 0) {
172 dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
173 inode, page, page->index, offset);
174 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
175 ceph_put_snap_context(snapc);
176 page->private = 0;
177 ClearPagePrivate(page);
178 } else {
179 dout("%p invalidatepage %p idx %lu partial dirty page\n",
180 inode, page, page->index);
181 }
182}
183
184/* just a sanity check */
185static int ceph_releasepage(struct page *page, gfp_t g)
186{
187 struct inode *inode = page->mapping ? page->mapping->host : NULL;
188 dout("%p releasepage %p idx %lu\n", inode, page, page->index);
189 WARN_ON(PageDirty(page));
190 WARN_ON(page->private);
191 WARN_ON(PagePrivate(page));
192 return 0;
193}
194
195/*
196 * read a single page, without unlocking it.
197 */
198static int readpage_nounlock(struct file *filp, struct page *page)
199{
200 struct inode *inode = filp->f_dentry->d_inode;
201 struct ceph_inode_info *ci = ceph_inode(inode);
202 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
203 int err = 0;
204 u64 len = PAGE_CACHE_SIZE;
205
206 dout("readpage inode %p file %p page %p index %lu\n",
207 inode, filp, page, page->index);
208 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
209 page->index << PAGE_CACHE_SHIFT, &len,
210 ci->i_truncate_seq, ci->i_truncate_size,
211 &page, 1);
212 if (err == -ENOENT)
213 err = 0;
214 if (err < 0) {
215 SetPageError(page);
216 goto out;
217 } else if (err < PAGE_CACHE_SIZE) {
218 /* zero fill remainder of page */
219 zero_user_segment(page, err, PAGE_CACHE_SIZE);
220 }
221 SetPageUptodate(page);
222
223out:
224 return err < 0 ? err : 0;
225}
226
227static int ceph_readpage(struct file *filp, struct page *page)
228{
229 int r = readpage_nounlock(filp, page);
230 unlock_page(page);
231 return r;
232}
233
234/*
235 * Build a vector of contiguous pages from the provided page list.
236 */
237static struct page **page_vector_from_list(struct list_head *page_list,
238 unsigned *nr_pages)
239{
240 struct page **pages;
241 struct page *page;
242 int next_index, contig_pages = 0;
243
244 /* build page vector */
245 pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
246 if (!pages)
247 return ERR_PTR(-ENOMEM);
248
249 BUG_ON(list_empty(page_list));
250 next_index = list_entry(page_list->prev, struct page, lru)->index;
251 list_for_each_entry_reverse(page, page_list, lru) {
252 if (page->index == next_index) {
253 dout("readpages page %d %p\n", contig_pages, page);
254 pages[contig_pages] = page;
255 contig_pages++;
256 next_index++;
257 } else {
258 break;
259 }
260 }
261 *nr_pages = contig_pages;
262 return pages;
263}
264
265/*
266 * Read multiple pages. Leave pages we don't read + unlock in page_list;
267 * the caller (VM) cleans them up.
268 */
269static int ceph_readpages(struct file *file, struct address_space *mapping,
270 struct list_head *page_list, unsigned nr_pages)
271{
272 struct inode *inode = file->f_dentry->d_inode;
273 struct ceph_inode_info *ci = ceph_inode(inode);
274 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
275 int rc = 0;
276 struct page **pages;
277 struct pagevec pvec;
278 loff_t offset;
279 u64 len;
280
281 dout("readpages %p file %p nr_pages %d\n",
282 inode, file, nr_pages);
283
284 pages = page_vector_from_list(page_list, &nr_pages);
285 if (IS_ERR(pages))
286 return PTR_ERR(pages);
287
288 /* guess read extent */
289 offset = pages[0]->index << PAGE_CACHE_SHIFT;
290 len = nr_pages << PAGE_CACHE_SHIFT;
291 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
292 offset, &len,
293 ci->i_truncate_seq, ci->i_truncate_size,
294 pages, nr_pages);
295 if (rc == -ENOENT)
296 rc = 0;
297 if (rc < 0)
298 goto out;
299
300 /* set uptodate and add to lru in pagevec-sized chunks */
301 pagevec_init(&pvec, 0);
302 for (; !list_empty(page_list) && len > 0;
303 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
304 struct page *page =
305 list_entry(page_list->prev, struct page, lru);
306
307 list_del(&page->lru);
308
309 if (rc < (int)PAGE_CACHE_SIZE) {
310 /* zero (remainder of) page */
311 int s = rc < 0 ? 0 : rc;
312 zero_user_segment(page, s, PAGE_CACHE_SIZE);
313 }
314
315 if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
316 page_cache_release(page);
317 dout("readpages %p add_to_page_cache failed %p\n",
318 inode, page);
319 continue;
320 }
321 dout("readpages %p adding %p idx %lu\n", inode, page,
322 page->index);
323 flush_dcache_page(page);
324 SetPageUptodate(page);
325 unlock_page(page);
326 if (pagevec_add(&pvec, page) == 0)
327 pagevec_lru_add_file(&pvec); /* add to lru */
328 }
329 pagevec_lru_add_file(&pvec);
330 rc = 0;
331
332out:
333 kfree(pages);
334 return rc;
335}
336
337/*
338 * Get ref for the oldest snapc for an inode with dirty data... that is, the
339 * only snap context we are allowed to write back.
340 */
341static struct ceph_snap_context *get_oldest_context(struct inode *inode,
342 u64 *snap_size)
343{
344 struct ceph_inode_info *ci = ceph_inode(inode);
345 struct ceph_snap_context *snapc = NULL;
346 struct ceph_cap_snap *capsnap = NULL;
347
348 spin_lock(&inode->i_lock);
349 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
350 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
351 capsnap->context, capsnap->dirty_pages);
352 if (capsnap->dirty_pages) {
353 snapc = ceph_get_snap_context(capsnap->context);
354 if (snap_size)
355 *snap_size = capsnap->size;
356 break;
357 }
358 }
359 if (!snapc && ci->i_head_snapc) {
360 snapc = ceph_get_snap_context(ci->i_head_snapc);
361 dout(" head snapc %p has %d dirty pages\n",
362 snapc, ci->i_wrbuffer_ref_head);
363 }
364 spin_unlock(&inode->i_lock);
365 return snapc;
366}
367
368/*
369 * Write a single page, but leave the page locked.
370 *
371 * If we get a write error, set the page error bit, but still adjust the
372 * dirty page accounting (i.e., page is no longer dirty).
373 */
374static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
375{
376 struct inode *inode;
377 struct ceph_inode_info *ci;
378 struct ceph_client *client;
379 struct ceph_osd_client *osdc;
380 loff_t page_off = page->index << PAGE_CACHE_SHIFT;
381 int len = PAGE_CACHE_SIZE;
382 loff_t i_size;
383 int err = 0;
384 struct ceph_snap_context *snapc, *oldest;
385 u64 snap_size = 0;
386 long writeback_stat;
387
388 dout("writepage %p idx %lu\n", page, page->index);
389
390 if (!page->mapping || !page->mapping->host) {
391 dout("writepage %p - no mapping\n", page);
392 return -EFAULT;
393 }
394 inode = page->mapping->host;
395 ci = ceph_inode(inode);
396 client = ceph_inode_to_client(inode);
397 osdc = &client->osdc;
398
399 /* verify this is a writeable snap context */
400 snapc = (void *)page->private;
401 if (snapc == NULL) {
402 dout("writepage %p page %p not dirty?\n", inode, page);
403 goto out;
404 }
405 oldest = get_oldest_context(inode, &snap_size);
406 if (snapc->seq > oldest->seq) {
407 dout("writepage %p page %p snapc %p not writeable - noop\n",
408 inode, page, (void *)page->private);
409 /* we should only noop if called by kswapd */
410 WARN_ON((current->flags & PF_MEMALLOC) == 0);
411 ceph_put_snap_context(oldest);
412 goto out;
413 }
414 ceph_put_snap_context(oldest);
415
416 /* is this a partial page at end of file? */
417 if (snap_size)
418 i_size = snap_size;
419 else
420 i_size = i_size_read(inode);
421 if (i_size < page_off + len)
422 len = i_size - page_off;
423
424 dout("writepage %p page %p index %lu on %llu~%u\n",
425 inode, page, page->index, page_off, len);
426
427 writeback_stat = atomic_long_inc_return(&client->writeback_count);
428 if (writeback_stat >
429 CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
430 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
431
432 set_page_writeback(page);
433 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
434 &ci->i_layout, snapc,
435 page_off, len,
436 ci->i_truncate_seq, ci->i_truncate_size,
437 &inode->i_mtime,
438 &page, 1, 0, 0, true);
439 if (err < 0) {
440 dout("writepage setting page/mapping error %d %p\n", err, page);
441 SetPageError(page);
442 mapping_set_error(&inode->i_data, err);
443 if (wbc)
444 wbc->pages_skipped++;
445 } else {
446 dout("writepage cleaned page %p\n", page);
447 err = 0; /* vfs expects us to return 0 */
448 }
449 page->private = 0;
450 ClearPagePrivate(page);
451 end_page_writeback(page);
452 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
453 ceph_put_snap_context(snapc); /* page's reference */
454out:
455 return err;
456}
457
458static int ceph_writepage(struct page *page, struct writeback_control *wbc)
459{
460 int err;
461 struct inode *inode = page->mapping->host;
462 BUG_ON(!inode);
463 igrab(inode);
464 err = writepage_nounlock(page, wbc);
465 unlock_page(page);
466 iput(inode);
467 return err;
468}
469
470
471/*
472 * lame release_pages helper. release_pages() isn't exported to
473 * modules.
474 */
475static void ceph_release_pages(struct page **pages, int num)
476{
477 struct pagevec pvec;
478 int i;
479
480 pagevec_init(&pvec, 0);
481 for (i = 0; i < num; i++) {
482 if (pagevec_add(&pvec, pages[i]) == 0)
483 pagevec_release(&pvec);
484 }
485 pagevec_release(&pvec);
486}
487
488
489/*
490 * async writeback completion handler.
491 *
492 * If we get an error, set the mapping error bit, but not the individual
493 * page error bits.
 *
 * Installed as req->r_callback by ceph_writepages_start().  For every page
 * in the request it drops the writeback accounting, releases the page's
 * snap context, ends writeback and unlocks the page; afterwards it drops
 * the wrbuffer cap refs, the page references, the page vector and the
 * request itself.
494 */
495static void writepages_finish(struct ceph_osd_request *req,
496 struct ceph_msg *msg)
497{
498 struct inode *inode = req->r_inode;
499 struct ceph_osd_reply_head *replyhead;
500 struct ceph_osd_op *op;
501 struct ceph_inode_info *ci = ceph_inode(inode);
502 unsigned wrote;
503 struct page *page;
504 int i;
505 struct ceph_snap_context *snapc = req->r_snapc;
506 struct address_space *mapping = inode->i_mapping;
	/*
	 * NOTE(review): r_wbc is the writeback_control pointer that
	 * ceph_writepages_start() was called with; confirm that object is
	 * still alive when this asynchronous completion runs.
	 */
507 struct writeback_control *wbc = req->r_wbc;
508 __s32 rc = -EIO;
509 u64 bytes = 0;
510 struct ceph_client *client = ceph_inode_to_client(inode);
511 long writeback_stat;
512 unsigned issued = __ceph_caps_issued(ci, NULL);
513
514 /* parse reply */
	/* the first op is laid out directly after the reply head */
515 replyhead = msg->front.iov_base;
516 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
517 op = (void *)(replyhead + 1);
518 rc = le32_to_cpu(replyhead->result);
519 bytes = le64_to_cpu(op->extent.length);
520
521 if (rc >= 0) {
522 /*
523 * Assume we wrote the pages we originally sent. The
524 * osd might reply with fewer pages if our writeback
525 * raced with a truncation and was adjusted at the osd,
526 * so don't believe the reply.
527 */
528 wrote = req->r_num_pages;
529 } else {
	/* on error nothing counts as written; every page is "skipped" */
530 wrote = 0;
531 mapping_set_error(mapping, rc);
532 }
533 dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
534 inode, rc, bytes, wrote);
535
536 /* clean all pages */
537 for (i = 0; i < req->r_num_pages; i++) {
538 page = req->r_pages[i];
539 BUG_ON(!page);
540 WARN_ON(!PageUptodate(page));
541
	/* undo the per-page increment done when the page was queued */
542 writeback_stat =
543 atomic_long_dec_return(&client->writeback_count);
544 if (writeback_stat <
545 CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
546 clear_bdi_congested(&client->backing_dev_info,
547 BLK_RW_ASYNC);
548
549 if (i >= wrote) {
550 dout("inode %p skipping page %p\n", inode, page);
551 wbc->pages_skipped++;
552 }
	/* page->private held the page's snap context reference */
553 ceph_put_snap_context((void *)page->private);
554 page->private = 0;
555 ClearPagePrivate(page);
556 dout("unlocking %d %p\n", i, page);
557 end_page_writeback(page);
558
559 /*
560 * We lost the cache cap, need to truncate the page before
561 * it is unlocked, otherwise we'd truncate it later in the
562 * page truncation thread, possibly losing some data that
563 * raced its way in
564 */
565 if ((issued & CEPH_CAP_FILE_CACHE) == 0)
566 generic_error_remove_page(inode->i_mapping, page);
567
568 unlock_page(page);
569 }
570 dout("%p wrote+cleaned %d pages\n", inode, wrote);
571 ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
572
	/* drop page refs, then free the vector the way it was allocated */
573 ceph_release_pages(req->r_pages, req->r_num_pages);
574 if (req->r_pages_from_pool)
575 mempool_free(req->r_pages,
576 ceph_client(inode->i_sb)->wb_pagevec_pool);
577 else
578 kfree(req->r_pages);
579 ceph_osdc_put_request(req);
580}
581
582/*
583 * allocate a page vec, either directly, or if necessary, via a the
584 * mempool. we avoid the mempool if we can because req->r_num_pages
585 * may be less than the maximum write size.
586 */
587static void alloc_page_vec(struct ceph_client *client,
588 struct ceph_osd_request *req)
589{
590 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
591 GFP_NOFS);
592 if (!req->r_pages) {
593 req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
594 req->r_pages_from_pool = 1;
595 WARN_ON(!req->r_pages);
596 }
597}
598
599/*
600 * initiate async writeback
601 */
602static int ceph_writepages_start(struct address_space *mapping,
603 struct writeback_control *wbc)
604{
605 struct inode *inode = mapping->host;
606 struct backing_dev_info *bdi = mapping->backing_dev_info;
607 struct ceph_inode_info *ci = ceph_inode(inode);
608 struct ceph_client *client;
609 pgoff_t index, start, end;
610 int range_whole = 0;
611 int should_loop = 1;
612 pgoff_t max_pages = 0, max_pages_ever = 0;
613 struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
614 struct pagevec pvec;
615 int done = 0;
616 int rc = 0;
617 unsigned wsize = 1 << inode->i_blkbits;
618 struct ceph_osd_request *req = NULL;
619 int do_sync;
620 u64 snap_size = 0;
621
622 /*
623 * Include a 'sync' in the OSD request if this is a data
624 * integrity write (e.g., O_SYNC write or fsync()), or if our
625 * cap is being revoked.
626 */
627 do_sync = wbc->sync_mode == WB_SYNC_ALL;
628 if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
629 do_sync = 1;
630 dout("writepages_start %p dosync=%d (mode=%s)\n",
631 inode, do_sync,
632 wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
633 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
634
635 client = ceph_inode_to_client(inode);
636 if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
637 pr_warning("writepage_start %p on forced umount\n", inode);
638 return -EIO; /* we're in a forced umount, don't write! */
639 }
640 if (client->mount_args->wsize && client->mount_args->wsize < wsize)
641 wsize = client->mount_args->wsize;
642 if (wsize < PAGE_CACHE_SIZE)
643 wsize = PAGE_CACHE_SIZE;
644 max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
645
646 pagevec_init(&pvec, 0);
647
648 /* ?? */
649 if (wbc->nonblocking && bdi_write_congested(bdi)) {
650 dout(" writepages congested\n");
651 wbc->encountered_congestion = 1;
652 goto out_final;
653 }
654
655 /* where to start/end? */
656 if (wbc->range_cyclic) {
657 start = mapping->writeback_index; /* Start from prev offset */
658 end = -1;
659 dout(" cyclic, start at %lu\n", start);
660 } else {
661 start = wbc->range_start >> PAGE_CACHE_SHIFT;
662 end = wbc->range_end >> PAGE_CACHE_SHIFT;
663 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
664 range_whole = 1;
665 should_loop = 0;
666 dout(" not cyclic, %lu to %lu\n", start, end);
667 }
668 index = start;
669
670retry:
671 /* find oldest snap context with dirty data */
672 ceph_put_snap_context(snapc);
673 snapc = get_oldest_context(inode, &snap_size);
674 if (!snapc) {
675 /* hmm, why does writepages get called when there
676 is no dirty data? */
677 dout(" no snap context with dirty data?\n");
678 goto out;
679 }
680 dout(" oldest snapc is %p seq %lld (%d snaps)\n",
681 snapc, snapc->seq, snapc->num_snaps);
682 if (last_snapc && snapc != last_snapc) {
683 /* if we switched to a newer snapc, restart our scan at the
684 * start of the original file range. */
685 dout(" snapc differs from last pass, restarting at %lu\n",
686 index);
687 index = start;
688 }
689 last_snapc = snapc;
690
691 while (!done && index <= end) {
692 unsigned i;
693 int first;
694 pgoff_t next;
695 int pvec_pages, locked_pages;
696 struct page *page;
697 int want;
698 u64 offset, len;
699 struct ceph_osd_request_head *reqhead;
700 struct ceph_osd_op *op;
701 long writeback_stat;
702
703 next = 0;
704 locked_pages = 0;
705 max_pages = max_pages_ever;
706
707get_more_pages:
708 first = -1;
709 want = min(end - index,
710 min((pgoff_t)PAGEVEC_SIZE,
711 max_pages - (pgoff_t)locked_pages) - 1)
712 + 1;
713 pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
714 PAGECACHE_TAG_DIRTY,
715 want);
716 dout("pagevec_lookup_tag got %d\n", pvec_pages);
717 if (!pvec_pages && !locked_pages)
718 break;
719 for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
720 page = pvec.pages[i];
721 dout("? %p idx %lu\n", page, page->index);
722 if (locked_pages == 0)
723 lock_page(page); /* first page */
724 else if (!trylock_page(page))
725 break;
726
727 /* only dirty pages, or our accounting breaks */
728 if (unlikely(!PageDirty(page)) ||
729 unlikely(page->mapping != mapping)) {
730 dout("!dirty or !mapping %p\n", page);
731 unlock_page(page);
732 break;
733 }
734 if (!wbc->range_cyclic && page->index > end) {
735 dout("end of range %p\n", page);
736 done = 1;
737 unlock_page(page);
738 break;
739 }
740 if (next && (page->index != next)) {
741 dout("not consecutive %p\n", page);
742 unlock_page(page);
743 break;
744 }
745 if (wbc->sync_mode != WB_SYNC_NONE) {
746 dout("waiting on writeback %p\n", page);
747 wait_on_page_writeback(page);
748 }
749 if ((snap_size && page_offset(page) > snap_size) ||
750 (!snap_size &&
751 page_offset(page) > i_size_read(inode))) {
752 dout("%p page eof %llu\n", page, snap_size ?
753 snap_size : i_size_read(inode));
754 done = 1;
755 unlock_page(page);
756 break;
757 }
758 if (PageWriteback(page)) {
759 dout("%p under writeback\n", page);
760 unlock_page(page);
761 break;
762 }
763
764 /* only if matching snap context */
765 pgsnapc = (void *)page->private;
766 if (pgsnapc->seq > snapc->seq) {
767 dout("page snapc %p %lld > oldest %p %lld\n",
768 pgsnapc, pgsnapc->seq, snapc, snapc->seq);
769 unlock_page(page);
770 if (!locked_pages)
771 continue; /* keep looking for snap */
772 break;
773 }
774
775 if (!clear_page_dirty_for_io(page)) {
776 dout("%p !clear_page_dirty_for_io\n", page);
777 unlock_page(page);
778 break;
779 }
780
781 /* ok */
782 if (locked_pages == 0) {
783 /* prepare async write request */
784 offset = page->index << PAGE_CACHE_SHIFT;
785 len = wsize;
786 req = ceph_osdc_new_request(&client->osdc,
787 &ci->i_layout,
788 ceph_vino(inode),
789 offset, &len,
790 CEPH_OSD_OP_WRITE,
791 CEPH_OSD_FLAG_WRITE |
792 CEPH_OSD_FLAG_ONDISK,
793 snapc, do_sync,
794 ci->i_truncate_seq,
795 ci->i_truncate_size,
796 &inode->i_mtime, true, 1);
797 max_pages = req->r_num_pages;
798
799 alloc_page_vec(client, req);
800 req->r_callback = writepages_finish;
801 req->r_inode = inode;
802 req->r_wbc = wbc;
803 }
804
805 /* note position of first page in pvec */
806 if (first < 0)
807 first = i;
808 dout("%p will write page %p idx %lu\n",
809 inode, page, page->index);
810
811 writeback_stat = atomic_long_inc_return(&client->writeback_count);
812 if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) {
813 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
814 }
815
816 set_page_writeback(page);
817 req->r_pages[locked_pages] = page;
818 locked_pages++;
819 next = page->index + 1;
820 }
821
822 /* did we get anything? */
823 if (!locked_pages)
824 goto release_pvec_pages;
825 if (i) {
826 int j;
827 BUG_ON(!locked_pages || first < 0);
828
829 if (pvec_pages && i == pvec_pages &&
830 locked_pages < max_pages) {
831 dout("reached end pvec, trying for more\n");
832 pagevec_reinit(&pvec);
833 goto get_more_pages;
834 }
835
836 /* shift unused pages over in the pvec... we
837 * will need to release them below. */
838 for (j = i; j < pvec_pages; j++) {
839 dout(" pvec leftover page %p\n",
840 pvec.pages[j]);
841 pvec.pages[j-i+first] = pvec.pages[j];
842 }
843 pvec.nr -= i-first;
844 }
845
846 /* submit the write */
847 offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
848 len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
849 (u64)locked_pages << PAGE_CACHE_SHIFT);
850 dout("writepages got %d pages at %llu~%llu\n",
851 locked_pages, offset, len);
852
853 /* revise final length, page count */
854 req->r_num_pages = locked_pages;
855 reqhead = req->r_request->front.iov_base;
856 op = (void *)(reqhead + 1);
857 op->extent.length = cpu_to_le64(len);
858 op->payload_len = cpu_to_le32(len);
859 req->r_request->hdr.data_len = cpu_to_le32(len);
860
861 ceph_osdc_start_request(&client->osdc, req, true);
862 req = NULL;
863
864 /* continue? */
865 index = next;
866 wbc->nr_to_write -= locked_pages;
867 if (wbc->nr_to_write <= 0)
868 done = 1;
869
870release_pvec_pages:
871 dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
872 pvec.nr ? pvec.pages[0] : NULL);
873 pagevec_release(&pvec);
874
875 if (locked_pages && !done)
876 goto retry;
877 }
878
879 if (should_loop && !done) {
880 /* more to do; loop back to beginning of file */
881 dout("writepages looping back to beginning of file\n");
882 should_loop = 0;
883 index = 0;
884 goto retry;
885 }
886
887 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
888 mapping->writeback_index = index;
889
890out:
891 if (req)
892 ceph_osdc_put_request(req);
893 if (rc > 0)
894 rc = 0; /* vfs expects us to return 0 */
895 ceph_put_snap_context(snapc);
896 dout("writepages done, rc = %d\n", rc);
897out_final:
898 return rc;
899}
900
901
902
903/*
904 * See if a given @snapc is either writeable, or already written.
905 */
906static int context_is_writeable_or_written(struct inode *inode,
907 struct ceph_snap_context *snapc)
908{
909 struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
910 int ret = !oldest || snapc->seq <= oldest->seq;
911
912 ceph_put_snap_context(oldest);
913 return ret;
914}
915
916/*
917 * We are only allowed to write into/dirty the page if the page is
918 * clean, or already dirty within the same snap context.
919 *
920 * called with page locked.
921 * return success with page locked,
922 * or any failure (incl -EAGAIN) with page unlocked.
 *
 * On success (return 0) the read lock on mdsc->snap_rwsem taken below is
 * still held; the callers visible in this file (ceph_write_end and
 * ceph_page_mkwrite) release it with up_read().  On every failure return
 * the rwsem has already been dropped.
 *
 * -EAGAIN means the page was dirty in a snap context that is not yet
 * writeable; writeback was queued and waited for, and the caller should
 * retry.  -ERESTARTSYS is returned if that wait was interrupted.
923 */
924static int ceph_update_writeable_page(struct file *file,
925 loff_t pos, unsigned len,
926 struct page *page)
927{
928 struct inode *inode = file->f_dentry->d_inode;
929 struct ceph_inode_info *ci = ceph_inode(inode);
930 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
931 loff_t page_off = pos & PAGE_CACHE_MASK;
932 int pos_in_page = pos & ~PAGE_CACHE_MASK;
933 int end_in_page = pos_in_page + len;
934 loff_t i_size;
935 int r;
936 struct ceph_snap_context *snapc, *oldest;
937
938retry_locked:
939 /* writepages currently holds page lock, but if we change that later, */
940 wait_on_page_writeback(page);
941
942 /* check snap context */
943 BUG_ON(!ci->i_snap_realm);
944 down_read(&mdsc->snap_rwsem);
945 BUG_ON(!ci->i_snap_realm->cached_context);
946 snapc = (void *)page->private;
947 if (snapc && snapc != ci->i_head_snapc) {
948 /*
949 * this page is already dirty in another (older) snap
950 * context! is it writeable now?
951 */
	/*
	 * NOTE(review): get_oldest_context() can return NULL, and
	 * oldest->seq is read below without a check; confirm a page that
	 * is dirty in a non-head context guarantees a non-NULL result.
	 */
952 oldest = get_oldest_context(inode, NULL);
953 up_read(&mdsc->snap_rwsem);
954
955 if (snapc->seq > oldest->seq) {
956 ceph_put_snap_context(oldest);
957 dout(" page %p snapc %p not current or oldest\n",
958 page, snapc);
959 /*
960 * queue for writeback, and wait for snapc to
961 * be writeable or written
962 */
	/* hold our own ref: the page is about to be unlocked */
963 snapc = ceph_get_snap_context(snapc);
964 unlock_page(page);
965 ceph_queue_writeback(inode);
966 r = wait_event_interruptible(ci->i_cap_wq,
967 context_is_writeable_or_written(inode, snapc));
968 ceph_put_snap_context(snapc);
969 if (r == -ERESTARTSYS)
970 return r;
971 return -EAGAIN;
972 }
973 ceph_put_snap_context(oldest);
974
975 /* yay, writeable, do it now (without dropping page lock) */
976 dout(" page %p snapc %p not current, but oldest\n",
977 page, snapc);
978 if (!clear_page_dirty_for_io(page))
979 goto retry_locked;
980 r = writepage_nounlock(page, NULL);
981 if (r < 0)
982 goto fail_nosnap;
983 goto retry_locked;
984 }
985
	/* the three "return 0" paths below leave snap_rwsem read-held */
986 if (PageUptodate(page)) {
987 dout(" page %p already uptodate\n", page);
988 return 0;
989 }
990
991 /* full page? */
992 if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
993 return 0;
994
995 /* past end of file? */
996 i_size = inode->i_size; /* caller holds i_mutex */
997
998 if (i_size + len > inode->i_sb->s_maxbytes) {
999 /* file is too big */
1000 r = -EINVAL;
1001 goto fail;
1002 }
1003
	/* no existing file data needs to be read: zero around the write */
1004 if (page_off >= i_size ||
1005 (pos_in_page == 0 && (pos+len) >= i_size &&
1006 end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
1007 dout(" zeroing %p 0 - %d and %d - %d\n",
1008 page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
1009 zero_user_segments(page,
1010 0, pos_in_page,
1011 end_in_page, PAGE_CACHE_SIZE);
1012 return 0;
1013 }
1014
1015 /* we need to read it. */
	/* drop the rwsem first; retry_locked takes it again */
1016 up_read(&mdsc->snap_rwsem);
1017 r = readpage_nounlock(file, page);
1018 if (r < 0)
1019 goto fail_nosnap;
1020 goto retry_locked;
1021
	/* fail: rwsem still held.  fail_nosnap: rwsem already released. */
1022fail:
1023 up_read(&mdsc->snap_rwsem);
1024fail_nosnap:
1025 unlock_page(page);
1026 return r;
1027}
1028
1029/*
1030 * We are only allowed to write into/dirty the page if the page is
1031 * clean, or already dirty within the same snap context.
1032 */
1033static int ceph_write_begin(struct file *file, struct address_space *mapping,
1034 loff_t pos, unsigned len, unsigned flags,
1035 struct page **pagep, void **fsdata)
1036{
1037 struct inode *inode = file->f_dentry->d_inode;
1038 struct page *page;
1039 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1040 int r;
1041
1042 do {
1043 /* get a page */
1044 page = grab_cache_page_write_begin(mapping, index, 0);
1045 if (!page)
1046 return -ENOMEM;
1047 *pagep = page;
1048
1049 dout("write_begin file %p inode %p page %p %d~%d\n", file,
1050 inode, page, (int)pos, (int)len);
1051
1052 r = ceph_update_writeable_page(file, pos, len, page);
1053 } while (r == -EAGAIN);
1054
1055 return r;
1056}
1057
1058/*
1059 * we don't do anything in here that simple_write_end doesn't do
1060 * except adjust dirty page accounting and drop read lock on
1061 * mdsc->snap_rwsem.
1062 */
1063static int ceph_write_end(struct file *file, struct address_space *mapping,
1064 loff_t pos, unsigned len, unsigned copied,
1065 struct page *page, void *fsdata)
1066{
1067 struct inode *inode = file->f_dentry->d_inode;
1068 struct ceph_client *client = ceph_inode_to_client(inode);
1069 struct ceph_mds_client *mdsc = &client->mdsc;
1070 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1071 int check_cap = 0;
1072
1073 dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
1074 inode, page, (int)pos, (int)copied, (int)len);
1075
1076 /* zero the stale part of the page if we did a short copy */
1077 if (copied < len)
1078 zero_user_segment(page, from+copied, len);
1079
1080 /* did file size increase? */
1081 /* (no need for i_size_read(); we caller holds i_mutex */
1082 if (pos+copied > inode->i_size)
1083 check_cap = ceph_inode_set_size(inode, pos+copied);
1084
1085 if (!PageUptodate(page))
1086 SetPageUptodate(page);
1087
1088 set_page_dirty(page);
1089
1090 unlock_page(page);
1091 up_read(&mdsc->snap_rwsem);
1092 page_cache_release(page);
1093
1094 if (check_cap)
1095 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
1096
1097 return copied;
1098}
1099
1100/*
1101 * we set .direct_IO to indicate direct io is supported, but since we
1102 * intercept O_DIRECT reads and writes early, this function should
1103 * never get called.
1104 */
1105static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
1106 const struct iovec *iov,
1107 loff_t pos, unsigned long nr_segs)
1108{
1109 WARN_ON(1);
1110 return -EINVAL;
1111}
1112
/*
 * Address space operations for ceph inodes.  .writepages starts
 * asynchronous writeback (ceph_writepages_start); .direct_IO is a stub
 * that only warns and returns -EINVAL (ceph_direct_io).
 */
1113const struct address_space_operations ceph_aops = {
1114 .readpage = ceph_readpage,
1115 .readpages = ceph_readpages,
1116 .writepage = ceph_writepage,
1117 .writepages = ceph_writepages_start,
1118 .write_begin = ceph_write_begin,
1119 .write_end = ceph_write_end,
1120 .set_page_dirty = ceph_set_page_dirty,
1121 .invalidatepage = ceph_invalidatepage,
1122 .releasepage = ceph_releasepage,
1123 .direct_IO = ceph_direct_io,
1124};
1125
1126
1127/*
1128 * vm ops
1129 */
1130
1131/*
1132 * Reuse write_begin here for simplicity.
1133 */
1134static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1135{
1136 struct inode *inode = vma->vm_file->f_dentry->d_inode;
1137 struct page *page = vmf->page;
1138 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
1139 loff_t off = page->index << PAGE_CACHE_SHIFT;
1140 loff_t size, len;
1141 int ret;
1142
1143 size = i_size_read(inode);
1144 if (off + PAGE_CACHE_SIZE <= size)
1145 len = PAGE_CACHE_SIZE;
1146 else
1147 len = size & ~PAGE_CACHE_MASK;
1148
1149 dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
1150 off, len, page, page->index);
1151
1152 lock_page(page);
1153
1154 ret = VM_FAULT_NOPAGE;
1155 if ((off > size) ||
1156 (page->mapping != inode->i_mapping))
1157 goto out;
1158
1159 ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
1160 if (ret == 0) {
1161 /* success. we'll keep the page locked. */
1162 set_page_dirty(page);
1163 up_read(&mdsc->snap_rwsem);
1164 ret = VM_FAULT_LOCKED;
1165 } else {
1166 if (ret == -ENOMEM)
1167 ret = VM_FAULT_OOM;
1168 else
1169 ret = VM_FAULT_SIGBUS;
1170 }
1171out:
1172 dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
1173 if (ret != VM_FAULT_LOCKED)
1174 unlock_page(page);
1175 return ret;
1176}
1177
/*
 * VM operations installed on ceph mappings by ceph_mmap(): faults use the
 * generic filemap_fault, write faults go through ceph_page_mkwrite.
 */
1178static struct vm_operations_struct ceph_vmops = {
1179 .fault = filemap_fault,
1180 .page_mkwrite = ceph_page_mkwrite,
1181};
1182
1183int ceph_mmap(struct file *file, struct vm_area_struct *vma)
1184{
1185 struct address_space *mapping = file->f_mapping;
1186
1187 if (!mapping->a_ops->readpage)
1188 return -ENOEXEC;
1189 file_accessed(file);
1190 vma->vm_ops = &ceph_vmops;
1191 vma->vm_flags |= VM_CAN_NONLINEAR;
1192 return 0;
1193}
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
new file mode 100644
index 000000000000..67b2c030924b
--- /dev/null
+++ b/fs/ceph/armor.c
@@ -0,0 +1,99 @@
1
2#include <linux/errno.h>
3
/*
 * base64 encode/decode.
 */

const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/* Map a 6-bit value (0..63) to its base64 character. */
static int encode_bits(int c)
{
	return pem_key[c];
}

/*
 * Map a base64 character to its 6-bit value.  The padding character '='
 * decodes to 0; any character outside the alphabet yields -EINVAL.
 */
static int decode_bits(char c)
{
	if (c >= 'A' && c <= 'Z')
		return c - 'A';
	if (c >= 'a' && c <= 'z')
		return c - 'a' + 26;
	if (c >= '0' && c <= '9')
		return c - '0' + 52;
	if (c == '+')
		return 62;
	if (c == '/')
		return 63;
	if (c == '=')
		return 0; /* just non-negative, please */
	return -EINVAL;
}

/*
 * Base64-encode the bytes in [src, end) into dst.
 *
 * Every 3 input bytes produce 4 output characters; a final partial group
 * is padded with '='.  A '\n' is emitted after every 64 output
 * characters.  The output is not NUL-terminated; dst must be large
 * enough for the result.  Returns the number of characters written.
 */
int ceph_armor(char *dst, const char *src, const char *end)
{
	int olen = 0;
	int line = 0;

	while (src < end) {
		unsigned char a, b, c;

		a = *src++;
		*dst++ = encode_bits(a >> 2);
		if (src < end) {
			b = *src++;
			*dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
			if (src < end) {
				c = *src++;
				*dst++ = encode_bits(((b & 15) << 2) |
						     (c >> 6));
				*dst++ = encode_bits(c & 63);
			} else {
				*dst++ = encode_bits((b & 15) << 2);
				*dst++ = '=';
			}
		} else {
			*dst++ = encode_bits(((a & 3) << 4));
			*dst++ = '=';
			*dst++ = '=';
		}
		olen += 4;
		line += 4;
		if (line == 64) {
			line = 0;
			*(dst++) = '\n';
			olen++;
		}
	}
	return olen;
}

/*
 * Decode the base64 text in [src, end) into dst.
 *
 * Newlines between 4-character groups are skipped, including a trailing
 * one (ceph_armor() emits a newline after every full 64-character line,
 * so its output may end in '\n').  Decoding stops at the first padded
 * group.  Returns the number of bytes written, or -EINVAL if the input
 * is truncated or contains a character outside the base64 alphabet.
 */
int ceph_unarmor(char *dst, const char *src, const char *end)
{
	int olen = 0;

	while (src < end) {
		int a, b, c, d;

		if (src[0] == '\n') {
			/* re-test src < end: the newline may be the last byte */
			src++;
			continue;
		}
		if (src + 4 > end)
			return -EINVAL;
		a = decode_bits(src[0]);
		b = decode_bits(src[1]);
		c = decode_bits(src[2]);
		d = decode_bits(src[3]);
		if (a < 0 || b < 0 || c < 0 || d < 0)
			return -EINVAL;

		*dst++ = (a << 2) | (b >> 4);
		if (src[2] == '=')
			return olen + 1;
		*dst++ = ((b & 15) << 4) | (c >> 2);
		if (src[3] == '=')
			return olen + 2;
		*dst++ = ((c & 3) << 6) | d;
		olen += 3;
		src += 4;
	}
	return olen;
}
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
new file mode 100644
index 000000000000..f6394b94b866
--- /dev/null
+++ b/fs/ceph/auth.c
@@ -0,0 +1,258 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/slab.h>
5#include <linux/err.h>
6
7#include "types.h"
8#include "auth_none.h"
9#include "auth_x.h"
10#include "decode.h"
11#include "super.h"
12
13#include "messenger.h"
14
15/*
16 * get protocol handler
17 */
/* protocols advertised to the monitor by ceph_auth_build_hello() */
18static u32 supported_protocols[] = {
19 CEPH_AUTH_NONE,
20 CEPH_AUTH_CEPHX
21};
22
23int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
24{
25 switch (protocol) {
26 case CEPH_AUTH_NONE:
27 return ceph_auth_none_init(ac);
28 case CEPH_AUTH_CEPHX:
29 return ceph_x_init(ac);
30 default:
31 return -ENOENT;
32 }
33}
34
35/*
36 * setup, teardown.
37 */
38struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
39{
40 struct ceph_auth_client *ac;
41 int ret;
42
43 dout("auth_init name '%s' secret '%s'\n", name, secret);
44
45 ret = -ENOMEM;
46 ac = kzalloc(sizeof(*ac), GFP_NOFS);
47 if (!ac)
48 goto out;
49
50 ac->negotiating = true;
51 if (name)
52 ac->name = name;
53 else
54 ac->name = CEPH_AUTH_NAME_DEFAULT;
55 dout("auth_init name %s secret %s\n", ac->name, secret);
56 ac->secret = secret;
57 return ac;
58
59out:
60 return ERR_PTR(ret);
61}
62
63void ceph_auth_destroy(struct ceph_auth_client *ac)
64{
65 dout("auth_destroy %p\n", ac);
66 if (ac->ops)
67 ac->ops->destroy(ac);
68 kfree(ac);
69}
70
71/*
72 * Reset occurs when reconnecting to the monitor.
73 */
74void ceph_auth_reset(struct ceph_auth_client *ac)
75{
76 dout("auth_reset %p\n", ac);
77 if (ac->ops && !ac->negotiating)
78 ac->ops->reset(ac);
79 ac->negotiating = true;
80}
81
82int ceph_entity_name_encode(const char *name, void **p, void *end)
83{
84 int len = strlen(name);
85
86 if (*p + 2*sizeof(u32) + len > end)
87 return -ERANGE;
88 ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
89 ceph_encode_32(p, len);
90 ceph_encode_copy(p, name, len);
91 return 0;
92}
93
94/*
95 * Initiate protocol negotiation with monitor. Include entity name
96 * and list supported protocols.
97 */
98int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
99{
100 struct ceph_mon_request_header *monhdr = buf;
101 void *p = monhdr + 1, *end = buf + len, *lenp;
102 int i, num;
103 int ret;
104
105 dout("auth_build_hello\n");
106 monhdr->have_version = 0;
107 monhdr->session_mon = cpu_to_le16(-1);
108 monhdr->session_mon_tid = 0;
109
110 ceph_encode_32(&p, 0); /* no protocol, yet */
111
112 lenp = p;
113 p += sizeof(u32);
114
115 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
116 ceph_encode_8(&p, 1);
117 num = ARRAY_SIZE(supported_protocols);
118 ceph_encode_32(&p, num);
119 ceph_decode_need(&p, end, num * sizeof(u32), bad);
120 for (i = 0; i < num; i++)
121 ceph_encode_32(&p, supported_protocols[i]);
122
123 ret = ceph_entity_name_encode(ac->name, &p, end);
124 if (ret < 0)
125 return ret;
126 ceph_decode_need(&p, end, sizeof(u64), bad);
127 ceph_encode_64(&p, ac->global_id);
128
129 ceph_encode_32(&lenp, p - lenp - sizeof(u32));
130 return p - buf;
131
132bad:
133 return -ERANGE;
134}
135
136int ceph_build_auth_request(struct ceph_auth_client *ac,
137 void *msg_buf, size_t msg_len)
138{
139 struct ceph_mon_request_header *monhdr = msg_buf;
140 void *p = monhdr + 1;
141 void *end = msg_buf + msg_len;
142 int ret;
143
144 monhdr->have_version = 0;
145 monhdr->session_mon = cpu_to_le16(-1);
146 monhdr->session_mon_tid = 0;
147
148 ceph_encode_32(&p, ac->protocol);
149
150 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
151 if (ret < 0) {
152 pr_err("error %d building request\n", ret);
153 return ret;
154 }
155 dout(" built request %d bytes\n", ret);
156 ceph_encode_32(&p, ret);
157 return p + ret - msg_buf;
158}
159
160/*
161 * Handle auth message from monitor.
162 */
163int ceph_handle_auth_reply(struct ceph_auth_client *ac,
164 void *buf, size_t len,
165 void *reply_buf, size_t reply_len)
166{
167 void *p = buf;
168 void *end = buf + len;
169 int protocol;
170 s32 result;
171 u64 global_id;
172 void *payload, *payload_end;
173 int payload_len;
174 char *result_msg;
175 int result_msg_len;
176 int ret = -EINVAL;
177
178 dout("handle_auth_reply %p %p\n", p, end);
179 ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
180 protocol = ceph_decode_32(&p);
181 result = ceph_decode_32(&p);
182 global_id = ceph_decode_64(&p);
183 payload_len = ceph_decode_32(&p);
184 payload = p;
185 p += payload_len;
186 ceph_decode_need(&p, end, sizeof(u32), bad);
187 result_msg_len = ceph_decode_32(&p);
188 result_msg = p;
189 p += result_msg_len;
190 if (p != end)
191 goto bad;
192
193 dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
194 result_msg, global_id, payload_len);
195
196 payload_end = payload + payload_len;
197
198 if (global_id && ac->global_id != global_id) {
199 dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
200 ac->global_id = global_id;
201 }
202
203 if (ac->negotiating) {
204 /* server does not support our protocols? */
205 if (!protocol && result < 0) {
206 ret = result;
207 goto out;
208 }
209 /* set up (new) protocol handler? */
210 if (ac->protocol && ac->protocol != protocol) {
211 ac->ops->destroy(ac);
212 ac->protocol = 0;
213 ac->ops = NULL;
214 }
215 if (ac->protocol != protocol) {
216 ret = ceph_auth_init_protocol(ac, protocol);
217 if (ret) {
218 pr_err("error %d on auth protocol %d init\n",
219 ret, protocol);
220 goto out;
221 }
222 }
223
224 ac->negotiating = false;
225 }
226
227 ret = ac->ops->handle_reply(ac, result, payload, payload_end);
228 if (ret == -EAGAIN) {
229 return ceph_build_auth_request(ac, reply_buf, reply_len);
230 } else if (ret) {
231 pr_err("authentication error %d\n", ret);
232 return ret;
233 }
234 return 0;
235
236bad:
237 pr_err("failed to decode auth msg\n");
238out:
239 return ret;
240}
241
242int ceph_build_auth(struct ceph_auth_client *ac,
243 void *msg_buf, size_t msg_len)
244{
245 if (!ac->protocol)
246 return ceph_auth_build_hello(ac, msg_buf, msg_len);
247 BUG_ON(!ac->ops);
248 if (!ac->ops->is_authenticated(ac))
249 return ceph_build_auth_request(ac, msg_buf, msg_len);
250 return 0;
251}
252
253int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
254{
255 if (!ac->ops)
256 return 0;
257 return ac->ops->is_authenticated(ac);
258}
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
new file mode 100644
index 000000000000..ca4f57cfb267
--- /dev/null
+++ b/fs/ceph/auth.h
@@ -0,0 +1,84 @@
1#ifndef _FS_CEPH_AUTH_H
2#define _FS_CEPH_AUTH_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * Abstract interface for communicating with the authenticate module.
9 * There is some handshake that takes place between us and the monitor
10 * to acquire the necessary keys. These are used to generate an
11 * 'authorizer' that we use when connecting to a service (mds, osd).
12 */
13
14struct ceph_auth_client;
15struct ceph_authorizer;
16
/*
 * Method table implemented by each auth protocol (auth_none, auth_x).
 *
 * NOTE(review): auth_none leaves build_request, verify_authorizer_reply
 * and invalidate_authorizer unset; confirm callers check for NULL.
 */
struct ceph_auth_client_ops {
	/*
	 * true if we are authenticated and can connect to
	 * services.
	 */
	int (*is_authenticated)(struct ceph_auth_client *ac);

	/*
	 * build requests and process replies during monitor
	 * handshake.  if handle_reply returns -EAGAIN, we build
	 * another request.  the request is written into [buf, end).
	 */
	int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
	int (*handle_reply)(struct ceph_auth_client *ac, int result,
			    void *buf, void *end);

	/*
	 * Create authorizer for connecting to a service, and verify
	 * the response to authenticate the service.  create_authorizer
	 * hands back the authorizer, the buffer to send and the buffer
	 * the reply should be received into.
	 */
	int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
				 struct ceph_authorizer **a,
				 void **buf, size_t *len,
				 void **reply_buf, size_t *reply_len);
	int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
				       struct ceph_authorizer *a, size_t len);
	void (*destroy_authorizer)(struct ceph_auth_client *ac,
				   struct ceph_authorizer *a);
	void (*invalidate_authorizer)(struct ceph_auth_client *ac,
				      int peer_type);

	/* reset when we (re)connect to a monitor */
	void (*reset)(struct ceph_auth_client *ac);

	/* release the protocol's private state (ac->private) */
	void (*destroy)(struct ceph_auth_client *ac);
};
53
/*
 * Per-client authentication state shared by the generic auth code and
 * the selected protocol implementation.
 */
struct ceph_auth_client {
	u32 protocol;           /* CEPH_AUTH_* */
	void *private;          /* for use by protocol implementation */
	const struct ceph_auth_client_ops *ops;  /* null iff protocol==0 */

	bool negotiating;       /* true if negotiating protocol */
	const char *name;       /* entity name */
	u64 global_id;          /* our unique id in system */
	const char *secret;     /* our secret key */
	unsigned want_keys;     /* which services we want */
};
65
66extern struct ceph_auth_client *ceph_auth_init(const char *name,
67 const char *secret);
68extern void ceph_auth_destroy(struct ceph_auth_client *ac);
69
70extern void ceph_auth_reset(struct ceph_auth_client *ac);
71
72extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
73 void *buf, size_t len);
74extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
75 void *buf, size_t len,
76 void *reply_buf, size_t reply_len);
77extern int ceph_entity_name_encode(const char *name, void **p, void *end);
78
79extern int ceph_build_auth(struct ceph_auth_client *ac,
80 void *msg_buf, size_t msg_len);
81
82extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
83
84#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
new file mode 100644
index 000000000000..8cd9e3af07f7
--- /dev/null
+++ b/fs/ceph/auth_none.c
@@ -0,0 +1,122 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_none.h"
10#include "auth.h"
11#include "decode.h"
12
13static void reset(struct ceph_auth_client *ac)
14{
15 struct ceph_auth_none_info *xi = ac->private;
16
17 xi->starting = true;
18 xi->built_authorizer = false;
19}
20
21static void destroy(struct ceph_auth_client *ac)
22{
23 kfree(ac->private);
24 ac->private = NULL;
25}
26
27static int is_authenticated(struct ceph_auth_client *ac)
28{
29 struct ceph_auth_none_info *xi = ac->private;
30
31 return !xi->starting;
32}
33
34/*
35 * the generic auth code decode the global_id, and we carry no actual
36 * authenticate state, so nothing happens here.
37 */
38static int handle_reply(struct ceph_auth_client *ac, int result,
39 void *buf, void *end)
40{
41 struct ceph_auth_none_info *xi = ac->private;
42
43 xi->starting = false;
44 return result;
45}
46
47/*
48 * build an 'authorizer' with our entity_name and global_id. we can
49 * reuse a single static copy since it is identical for all services
50 * we connect to.
51 */
52static int ceph_auth_none_create_authorizer(
53 struct ceph_auth_client *ac, int peer_type,
54 struct ceph_authorizer **a,
55 void **buf, size_t *len,
56 void **reply_buf, size_t *reply_len)
57{
58 struct ceph_auth_none_info *ai = ac->private;
59 struct ceph_none_authorizer *au = &ai->au;
60 void *p, *end;
61 int ret;
62
63 if (!ai->built_authorizer) {
64 p = au->buf;
65 end = p + sizeof(au->buf);
66 ceph_encode_8(&p, 1);
67 ret = ceph_entity_name_encode(ac->name, &p, end - 8);
68 if (ret < 0)
69 goto bad;
70 ceph_decode_need(&p, end, sizeof(u64), bad2);
71 ceph_encode_64(&p, ac->global_id);
72 au->buf_len = p - (void *)au->buf;
73 ai->built_authorizer = true;
74 dout("built authorizer len %d\n", au->buf_len);
75 }
76
77 *a = (struct ceph_authorizer *)au;
78 *buf = au->buf;
79 *len = au->buf_len;
80 *reply_buf = au->reply_buf;
81 *reply_len = sizeof(au->reply_buf);
82 return 0;
83
84bad2:
85 ret = -ERANGE;
86bad:
87 return ret;
88}
89
/*
 * The authorizer handed out by create_authorizer is embedded in the
 * private info, so there is nothing to release here.
 */
static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
					       struct ceph_authorizer *a)
{
}
95
96static const struct ceph_auth_client_ops ceph_auth_none_ops = {
97 .reset = reset,
98 .destroy = destroy,
99 .is_authenticated = is_authenticated,
100 .handle_reply = handle_reply,
101 .create_authorizer = ceph_auth_none_create_authorizer,
102 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
103};
104
105int ceph_auth_none_init(struct ceph_auth_client *ac)
106{
107 struct ceph_auth_none_info *xi;
108
109 dout("ceph_auth_none_init %p\n", ac);
110 xi = kzalloc(sizeof(*xi), GFP_NOFS);
111 if (!xi)
112 return -ENOMEM;
113
114 xi->starting = true;
115 xi->built_authorizer = false;
116
117 ac->protocol = CEPH_AUTH_NONE;
118 ac->private = xi;
119 ac->ops = &ceph_auth_none_ops;
120 return 0;
121}
122
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
new file mode 100644
index 000000000000..56c05533a31c
--- /dev/null
+++ b/fs/ceph/auth_none.h
@@ -0,0 +1,28 @@
1#ifndef _FS_CEPH_AUTH_NONE_H
2#define _FS_CEPH_AUTH_NONE_H
3
4#include "auth.h"
5
6/*
7 * null security mode.
8 *
9 * we use a single static authorizer that simply encodes our entity name
10 * and global id.
11 */
12
/*
 * The authorizer sent to services in "none" mode.  buf holds the
 * encoded entity name and global id; buf_len is how much of it is used.
 */
struct ceph_none_authorizer {
	char buf[128];		/* encoded authorizer */
	int buf_len;		/* bytes of buf in use */
	char reply_buf[0];	/* zero-sized: no room is reserved for a reply */
};
18
/* Private state of the "none" protocol, hung off ceph_auth_client.private. */
struct ceph_auth_none_info {
	bool starting;		/* true until the first reply is handled */
	bool built_authorizer;	/* true once au.buf has been encoded */
	struct ceph_none_authorizer au;   /* we only need one; it's static */
};
24
25extern int ceph_auth_none_init(struct ceph_auth_client *ac);
26
27#endif
28
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
new file mode 100644
index 000000000000..d9001a4dc8cc
--- /dev/null
+++ b/fs/ceph/auth_x.c
@@ -0,0 +1,680 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_x.h"
10#include "auth_x_protocol.h"
11#include "crypto.h"
12#include "auth.h"
13#include "decode.h"
14
15struct kmem_cache *ceph_x_ticketbuf_cachep;
16
17#define TEMP_TICKET_BUF_LEN 256
18
19static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
20
21static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
22{
23 struct ceph_x_info *xi = ac->private;
24 int need;
25
26 ceph_x_validate_tickets(ac, &need);
27 dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
28 ac->want_keys, need, xi->have_keys);
29 return (ac->want_keys & xi->have_keys) == ac->want_keys;
30}
31
32static int ceph_x_encrypt_buflen(int ilen)
33{
34 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
35 sizeof(u32);
36}
37
38static int ceph_x_encrypt(struct ceph_crypto_key *secret,
39 void *ibuf, int ilen, void *obuf, size_t olen)
40{
41 struct ceph_x_encrypt_header head = {
42 .struct_v = 1,
43 .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
44 };
45 size_t len = olen - sizeof(u32);
46 int ret;
47
48 ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
49 &head, sizeof(head), ibuf, ilen);
50 if (ret)
51 return ret;
52 ceph_encode_32(&obuf, len);
53 return len + sizeof(u32);
54}
55
56static int ceph_x_decrypt(struct ceph_crypto_key *secret,
57 void **p, void *end, void *obuf, size_t olen)
58{
59 struct ceph_x_encrypt_header head;
60 size_t head_len = sizeof(head);
61 int len, ret;
62
63 len = ceph_decode_32(p);
64 if (*p + len > end)
65 return -EINVAL;
66
67 dout("ceph_x_decrypt len %d\n", len);
68 ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
69 *p, len);
70 if (ret)
71 return ret;
72 if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
73 return -EPERM;
74 *p += len;
75 return olen;
76}
77
78/*
79 * get existing (or insert new) ticket handler
80 */
81struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac,
82 int service)
83{
84 struct ceph_x_ticket_handler *th;
85 struct ceph_x_info *xi = ac->private;
86 struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
87
88 while (*p) {
89 parent = *p;
90 th = rb_entry(parent, struct ceph_x_ticket_handler, node);
91 if (service < th->service)
92 p = &(*p)->rb_left;
93 else if (service > th->service)
94 p = &(*p)->rb_right;
95 else
96 return th;
97 }
98
99 /* add it */
100 th = kzalloc(sizeof(*th), GFP_NOFS);
101 if (!th)
102 return ERR_PTR(-ENOMEM);
103 th->service = service;
104 rb_link_node(&th->node, parent, p);
105 rb_insert_color(&th->node, &xi->ticket_handlers);
106 return th;
107}
108
109static void remove_ticket_handler(struct ceph_auth_client *ac,
110 struct ceph_x_ticket_handler *th)
111{
112 struct ceph_x_info *xi = ac->private;
113
114 dout("remove_ticket_handler %p %d\n", th, th->service);
115 rb_erase(&th->node, &xi->ticket_handlers);
116 ceph_crypto_key_destroy(&th->session_key);
117 if (th->ticket_blob)
118 ceph_buffer_put(th->ticket_blob);
119 kfree(th);
120}
121
122static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
123 struct ceph_crypto_key *secret,
124 void *buf, void *end)
125{
126 struct ceph_x_info *xi = ac->private;
127 int num;
128 void *p = buf;
129 int ret;
130 char *dbuf;
131 char *ticket_buf;
132 u8 struct_v;
133
134 dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS | GFP_ATOMIC);
135 if (!dbuf)
136 return -ENOMEM;
137
138 ret = -ENOMEM;
139 ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep,
140 GFP_NOFS | GFP_ATOMIC);
141 if (!ticket_buf)
142 goto out_dbuf;
143
144 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
145 struct_v = ceph_decode_8(&p);
146 if (struct_v != 1)
147 goto bad;
148 num = ceph_decode_32(&p);
149 dout("%d tickets\n", num);
150 while (num--) {
151 int type;
152 u8 struct_v;
153 struct ceph_x_ticket_handler *th;
154 void *dp, *dend;
155 int dlen;
156 char is_enc;
157 struct timespec validity;
158 struct ceph_crypto_key old_key;
159 void *tp, *tpend;
160 struct ceph_timespec new_validity;
161 struct ceph_crypto_key new_session_key;
162 struct ceph_buffer *new_ticket_blob;
163 unsigned long new_expires, new_renew_after;
164 u64 new_secret_id;
165
166 ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
167
168 type = ceph_decode_32(&p);
169 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
170
171 struct_v = ceph_decode_8(&p);
172 if (struct_v != 1)
173 goto bad;
174
175 th = get_ticket_handler(ac, type);
176 if (IS_ERR(th)) {
177 ret = PTR_ERR(th);
178 goto out;
179 }
180
181 /* blob for me */
182 dlen = ceph_x_decrypt(secret, &p, end, dbuf,
183 TEMP_TICKET_BUF_LEN);
184 if (dlen <= 0) {
185 ret = dlen;
186 goto out;
187 }
188 dout(" decrypted %d bytes\n", dlen);
189 dend = dbuf + dlen;
190 dp = dbuf;
191
192 struct_v = ceph_decode_8(&dp);
193 if (struct_v != 1)
194 goto bad;
195
196 memcpy(&old_key, &th->session_key, sizeof(old_key));
197 ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
198 if (ret)
199 goto out;
200
201 ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
202 ceph_decode_timespec(&validity, &new_validity);
203 new_expires = get_seconds() + validity.tv_sec;
204 new_renew_after = new_expires - (validity.tv_sec / 4);
205 dout(" expires=%lu renew_after=%lu\n", new_expires,
206 new_renew_after);
207
208 /* ticket blob for service */
209 ceph_decode_8_safe(&p, end, is_enc, bad);
210 tp = ticket_buf;
211 if (is_enc) {
212 /* encrypted */
213 dout(" encrypted ticket\n");
214 dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
215 TEMP_TICKET_BUF_LEN);
216 if (dlen < 0) {
217 ret = dlen;
218 goto out;
219 }
220 dlen = ceph_decode_32(&tp);
221 } else {
222 /* unencrypted */
223 ceph_decode_32_safe(&p, end, dlen, bad);
224 ceph_decode_need(&p, end, dlen, bad);
225 ceph_decode_copy(&p, ticket_buf, dlen);
226 }
227 tpend = tp + dlen;
228 dout(" ticket blob is %d bytes\n", dlen);
229 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
230 struct_v = ceph_decode_8(&tp);
231 new_secret_id = ceph_decode_64(&tp);
232 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
233 if (ret)
234 goto out;
235
236 /* all is well, update our ticket */
237 ceph_crypto_key_destroy(&th->session_key);
238 if (th->ticket_blob)
239 ceph_buffer_put(th->ticket_blob);
240 th->session_key = new_session_key;
241 th->ticket_blob = new_ticket_blob;
242 th->validity = new_validity;
243 th->secret_id = new_secret_id;
244 th->expires = new_expires;
245 th->renew_after = new_renew_after;
246 dout(" got ticket service %d (%s) secret_id %lld len %d\n",
247 type, ceph_entity_type_name(type), th->secret_id,
248 (int)th->ticket_blob->vec.iov_len);
249 xi->have_keys |= th->service;
250 }
251
252 ret = 0;
253out:
254 kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf);
255out_dbuf:
256 kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf);
257 return ret;
258
259bad:
260 ret = -EINVAL;
261 goto out;
262}
263
264static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
265 struct ceph_x_ticket_handler *th,
266 struct ceph_x_authorizer *au)
267{
268 int maxlen;
269 struct ceph_x_authorize_a *msg_a;
270 struct ceph_x_authorize_b msg_b;
271 void *p, *end;
272 int ret;
273 int ticket_blob_len =
274 (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
275
276 dout("build_authorizer for %s %p\n",
277 ceph_entity_type_name(th->service), au);
278
279 maxlen = sizeof(*msg_a) + sizeof(msg_b) +
280 ceph_x_encrypt_buflen(ticket_blob_len);
281 dout(" need len %d\n", maxlen);
282 if (au->buf && au->buf->alloc_len < maxlen) {
283 ceph_buffer_put(au->buf);
284 au->buf = NULL;
285 }
286 if (!au->buf) {
287 au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
288 if (!au->buf)
289 return -ENOMEM;
290 }
291 au->service = th->service;
292
293 msg_a = au->buf->vec.iov_base;
294 msg_a->struct_v = 1;
295 msg_a->global_id = cpu_to_le64(ac->global_id);
296 msg_a->service_id = cpu_to_le32(th->service);
297 msg_a->ticket_blob.struct_v = 1;
298 msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
299 msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
300 if (ticket_blob_len) {
301 memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
302 th->ticket_blob->vec.iov_len);
303 }
304 dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
305 le64_to_cpu(msg_a->ticket_blob.secret_id));
306
307 p = msg_a + 1;
308 p += ticket_blob_len;
309 end = au->buf->vec.iov_base + au->buf->vec.iov_len;
310
311 get_random_bytes(&au->nonce, sizeof(au->nonce));
312 msg_b.struct_v = 1;
313 msg_b.nonce = cpu_to_le64(au->nonce);
314 ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
315 p, end - p);
316 if (ret < 0)
317 goto out_buf;
318 p += ret;
319 au->buf->vec.iov_len = p - au->buf->vec.iov_base;
320 dout(" built authorizer nonce %llx len %d\n", au->nonce,
321 (int)au->buf->vec.iov_len);
322 BUG_ON(au->buf->vec.iov_len > maxlen);
323 return 0;
324
325out_buf:
326 ceph_buffer_put(au->buf);
327 au->buf = NULL;
328 return ret;
329}
330
331static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
332 void **p, void *end)
333{
334 ceph_decode_need(p, end, 1 + sizeof(u64), bad);
335 ceph_encode_8(p, 1);
336 ceph_encode_64(p, th->secret_id);
337 if (th->ticket_blob) {
338 const char *buf = th->ticket_blob->vec.iov_base;
339 u32 len = th->ticket_blob->vec.iov_len;
340
341 ceph_encode_32_safe(p, end, len, bad);
342 ceph_encode_copy_safe(p, end, buf, len, bad);
343 } else {
344 ceph_encode_32_safe(p, end, 0, bad);
345 }
346
347 return 0;
348bad:
349 return -ERANGE;
350}
351
352static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
353{
354 int want = ac->want_keys;
355 struct ceph_x_info *xi = ac->private;
356 int service;
357
358 *pneed = ac->want_keys & ~(xi->have_keys);
359
360 for (service = 1; service <= want; service <<= 1) {
361 struct ceph_x_ticket_handler *th;
362
363 if (!(ac->want_keys & service))
364 continue;
365
366 if (*pneed & service)
367 continue;
368
369 th = get_ticket_handler(ac, service);
370
371 if (!th) {
372 *pneed |= service;
373 continue;
374 }
375
376 if (get_seconds() >= th->renew_after)
377 *pneed |= service;
378 if (get_seconds() >= th->expires)
379 xi->have_keys &= ~service;
380 }
381}
382
383
384static int ceph_x_build_request(struct ceph_auth_client *ac,
385 void *buf, void *end)
386{
387 struct ceph_x_info *xi = ac->private;
388 int need;
389 struct ceph_x_request_header *head = buf;
390 int ret;
391 struct ceph_x_ticket_handler *th =
392 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
393
394 ceph_x_validate_tickets(ac, &need);
395
396 dout("build_request want %x have %x need %x\n",
397 ac->want_keys, xi->have_keys, need);
398
399 if (need & CEPH_ENTITY_TYPE_AUTH) {
400 struct ceph_x_authenticate *auth = (void *)(head + 1);
401 void *p = auth + 1;
402 struct ceph_x_challenge_blob tmp;
403 char tmp_enc[40];
404 u64 *u;
405
406 if (p > end)
407 return -ERANGE;
408
409 dout(" get_auth_session_key\n");
410 head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
411
412 /* encrypt and hash */
413 get_random_bytes(&auth->client_challenge, sizeof(u64));
414 tmp.client_challenge = auth->client_challenge;
415 tmp.server_challenge = cpu_to_le64(xi->server_challenge);
416 ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
417 tmp_enc, sizeof(tmp_enc));
418 if (ret < 0)
419 return ret;
420
421 auth->struct_v = 1;
422 auth->key = 0;
423 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
424 auth->key ^= *u;
425 dout(" server_challenge %llx client_challenge %llx key %llx\n",
426 xi->server_challenge, le64_to_cpu(auth->client_challenge),
427 le64_to_cpu(auth->key));
428
429 /* now encode the old ticket if exists */
430 ret = ceph_x_encode_ticket(th, &p, end);
431 if (ret < 0)
432 return ret;
433
434 return p - buf;
435 }
436
437 if (need) {
438 void *p = head + 1;
439 struct ceph_x_service_ticket_request *req;
440
441 if (p > end)
442 return -ERANGE;
443 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
444
445 BUG_ON(!th);
446 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
447 if (ret)
448 return ret;
449 ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
450 xi->auth_authorizer.buf->vec.iov_len);
451
452 req = p;
453 req->keys = cpu_to_le32(need);
454 p += sizeof(*req);
455 return p - buf;
456 }
457
458 return 0;
459}
460
461static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
462 void *buf, void *end)
463{
464 struct ceph_x_info *xi = ac->private;
465 struct ceph_x_reply_header *head = buf;
466 struct ceph_x_ticket_handler *th;
467 int len = end - buf;
468 int op;
469 int ret;
470
471 if (result)
472 return result; /* XXX hmm? */
473
474 if (xi->starting) {
475 /* it's a hello */
476 struct ceph_x_server_challenge *sc = buf;
477
478 if (len != sizeof(*sc))
479 return -EINVAL;
480 xi->server_challenge = le64_to_cpu(sc->server_challenge);
481 dout("handle_reply got server challenge %llx\n",
482 xi->server_challenge);
483 xi->starting = false;
484 xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
485 return -EAGAIN;
486 }
487
488 op = le32_to_cpu(head->op);
489 result = le32_to_cpu(head->result);
490 dout("handle_reply op %d result %d\n", op, result);
491 switch (op) {
492 case CEPHX_GET_AUTH_SESSION_KEY:
493 /* verify auth key */
494 ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
495 buf + sizeof(*head), end);
496 break;
497
498 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
499 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
500 BUG_ON(!th);
501 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
502 buf + sizeof(*head), end);
503 break;
504
505 default:
506 return -EINVAL;
507 }
508 if (ret)
509 return ret;
510 if (ac->want_keys == xi->have_keys)
511 return 0;
512 return -EAGAIN;
513}
514
515static int ceph_x_create_authorizer(
516 struct ceph_auth_client *ac, int peer_type,
517 struct ceph_authorizer **a,
518 void **buf, size_t *len,
519 void **reply_buf, size_t *reply_len)
520{
521 struct ceph_x_authorizer *au;
522 struct ceph_x_ticket_handler *th;
523 int ret;
524
525 th = get_ticket_handler(ac, peer_type);
526 if (IS_ERR(th))
527 return PTR_ERR(th);
528
529 au = kzalloc(sizeof(*au), GFP_NOFS);
530 if (!au)
531 return -ENOMEM;
532
533 ret = ceph_x_build_authorizer(ac, th, au);
534 if (ret) {
535 kfree(au);
536 return ret;
537 }
538
539 *a = (struct ceph_authorizer *)au;
540 *buf = au->buf->vec.iov_base;
541 *len = au->buf->vec.iov_len;
542 *reply_buf = au->reply_buf;
543 *reply_len = sizeof(au->reply_buf);
544 return 0;
545}
546
547static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
548 struct ceph_authorizer *a, size_t len)
549{
550 struct ceph_x_authorizer *au = (void *)a;
551 struct ceph_x_ticket_handler *th;
552 int ret = 0;
553 struct ceph_x_authorize_reply reply;
554 void *p = au->reply_buf;
555 void *end = p + sizeof(au->reply_buf);
556
557 th = get_ticket_handler(ac, au->service);
558 if (!th)
559 return -EIO; /* hrm! */
560 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
561 if (ret < 0)
562 return ret;
563 if (ret != sizeof(reply))
564 return -EPERM;
565
566 if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
567 ret = -EPERM;
568 else
569 ret = 0;
570 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
571 au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
572 return ret;
573}
574
575static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
576 struct ceph_authorizer *a)
577{
578 struct ceph_x_authorizer *au = (void *)a;
579
580 ceph_buffer_put(au->buf);
581 kfree(au);
582}
583
584
585static void ceph_x_reset(struct ceph_auth_client *ac)
586{
587 struct ceph_x_info *xi = ac->private;
588
589 dout("reset\n");
590 xi->starting = true;
591 xi->server_challenge = 0;
592}
593
594static void ceph_x_destroy(struct ceph_auth_client *ac)
595{
596 struct ceph_x_info *xi = ac->private;
597 struct rb_node *p;
598
599 dout("ceph_x_destroy %p\n", ac);
600 ceph_crypto_key_destroy(&xi->secret);
601
602 while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
603 struct ceph_x_ticket_handler *th =
604 rb_entry(p, struct ceph_x_ticket_handler, node);
605 remove_ticket_handler(ac, th);
606 }
607
608 kmem_cache_destroy(ceph_x_ticketbuf_cachep);
609
610 kfree(ac->private);
611 ac->private = NULL;
612}
613
/*
 * Throw away the ticket handler (and with it the ticket) for
 * @peer_type.
 */
static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
					 int peer_type)
{
	struct ceph_x_ticket_handler *th = get_ticket_handler(ac, peer_type);

	if (!th || IS_ERR(th))
		return;
	remove_ticket_handler(ac, th);
}
623
624
625static const struct ceph_auth_client_ops ceph_x_ops = {
626 .is_authenticated = ceph_x_is_authenticated,
627 .build_request = ceph_x_build_request,
628 .handle_reply = ceph_x_handle_reply,
629 .create_authorizer = ceph_x_create_authorizer,
630 .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
631 .destroy_authorizer = ceph_x_destroy_authorizer,
632 .invalidate_authorizer = ceph_x_invalidate_authorizer,
633 .reset = ceph_x_reset,
634 .destroy = ceph_x_destroy,
635};
636
637
638int ceph_x_init(struct ceph_auth_client *ac)
639{
640 struct ceph_x_info *xi;
641 int ret;
642
643 dout("ceph_x_init %p\n", ac);
644 xi = kzalloc(sizeof(*xi), GFP_NOFS);
645 if (!xi)
646 return -ENOMEM;
647
648 ret = -ENOMEM;
649 ceph_x_ticketbuf_cachep = kmem_cache_create("ceph_x_ticketbuf",
650 TEMP_TICKET_BUF_LEN, 8,
651 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
652 NULL);
653 if (!ceph_x_ticketbuf_cachep)
654 goto done_nomem;
655 ret = -EINVAL;
656 if (!ac->secret) {
657 pr_err("no secret set (for auth_x protocol)\n");
658 goto done_nomem;
659 }
660
661 ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
662 if (ret)
663 goto done_nomem;
664
665 xi->starting = true;
666 xi->ticket_handlers = RB_ROOT;
667
668 ac->protocol = CEPH_AUTH_CEPHX;
669 ac->private = xi;
670 ac->ops = &ceph_x_ops;
671 return 0;
672
673done_nomem:
674 kfree(xi);
675 if (ceph_x_ticketbuf_cachep)
676 kmem_cache_destroy(ceph_x_ticketbuf_cachep);
677 return ret;
678}
679
680
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
new file mode 100644
index 000000000000..ff6f8180e681
--- /dev/null
+++ b/fs/ceph/auth_x.h
@@ -0,0 +1,49 @@
1#ifndef _FS_CEPH_AUTH_X_H
2#define _FS_CEPH_AUTH_X_H
3
4#include <linux/rbtree.h>
5
6#include "crypto.h"
7#include "auth.h"
8#include "auth_x_protocol.h"
9
10/*
11 * Handle ticket for a single service.
12 */
struct ceph_x_ticket_handler {
	struct rb_node node;	/* in ceph_x_info.ticket_handlers, keyed by service */
	unsigned service;	/* service (entity type) this ticket is for */

	struct ceph_crypto_key session_key;	/* key delivered with the ticket */
	struct ceph_timespec validity;		/* lifetime as received */

	u64 secret_id;				/* sent along with the ticket blob */
	struct ceph_buffer *ticket_blob;	/* NULL until a ticket is received */

	/* get_seconds() times at which to renew, and at which the ticket expires */
	unsigned long renew_after, expires;
};
25
26
/* An authorizer built for one service, plus room for that service's reply. */
struct ceph_x_authorizer {
	struct ceph_buffer *buf;	/* encoded authorizer to send */
	unsigned service;		/* service the authorizer was built for */
	u64 nonce;			/* random; the reply must carry nonce + 1 */
	char reply_buf[128];          /* big enough for encrypted blob */
};
33
/* Private state of the cephx protocol, hung off ceph_auth_client.private. */
struct ceph_x_info {
	struct ceph_crypto_key secret;	/* unarmored from ceph_auth_client.secret */

	bool starting;			/* true until the server challenge arrives */
	u64 server_challenge;		/* challenge from the monitor's hello reply */

	unsigned have_keys;		/* bitmask of services we hold a ticket for */
	struct rb_root ticket_handlers;	/* ceph_x_ticket_handler tree, by service */

	/* authorizer sent when requesting service tickets */
	struct ceph_x_authorizer auth_authorizer;
};
45
46extern int ceph_x_init(struct ceph_auth_client *ac);
47
48#endif
49
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
new file mode 100644
index 000000000000..671d30576c4f
--- /dev/null
+++ b/fs/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
1#ifndef __FS_CEPH_AUTH_X_PROTOCOL
2#define __FS_CEPH_AUTH_X_PROTOCOL
3
4#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
5#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
6#define CEPHX_GET_ROTATING_KEY 0x0400
7
8/* common bits */
/* Ticket as carried inside an authorizer: a header followed by blob_len bytes. */
struct ceph_x_ticket_blob {
	__u8 struct_v;
	__le64 secret_id;
	__le32 blob_len;	/* number of bytes in blob[] */
	char blob[];
} __attribute__ ((packed));
15
16
17/* common request/reply headers */
/* Starts every cephx request; op is one of the CEPHX_GET_* values. */
struct ceph_x_request_header {
	__le16 op;
} __attribute__ ((packed));
21
/* Starts every cephx reply except the initial server challenge. */
struct ceph_x_reply_header {
	__le16 op;		/* CEPHX_GET_* value this reply answers */
	__le32 result;
} __attribute__ ((packed));
26
27
28/* authenticate handshake */
29
30/* initial hello (no reply header) */
/* Whole payload of the monitor's reply to our hello. */
struct ceph_x_server_challenge {
	__u8 struct_v;
	__le64 server_challenge;
} __attribute__ ((packed));
35
/* Body of a CEPHX_GET_AUTH_SESSION_KEY request; the old ticket follows it. */
struct ceph_x_authenticate {
	__u8 struct_v;
	__le64 client_challenge;	/* random value chosen by the client */
	__le64 key;			/* XOR-fold of the encrypted challenge blob */
	/* ticket blob */
} __attribute__ ((packed));
42
/* Follows the authorizer in a CEPHX_GET_PRINCIPAL_SESSION_KEY request. */
struct ceph_x_service_ticket_request {
	__u8 struct_v;
	__le32 keys;		/* bitmask of services we need tickets for */
} __attribute__ ((packed));
47
/* Challenge pair the client encrypts with its secret to derive the auth key. */
struct ceph_x_challenge_blob {
	__le64 server_challenge;
	__le64 client_challenge;
} __attribute__ ((packed));
52
53
54
55/* authorize handshake */
56
57/*
58 * The authorizer consists of two pieces:
59 * a - service id, ticket blob
60 * b - encrypted with session key
61 */
/* Unencrypted first piece of an authorizer; the ticket bytes follow it. */
struct ceph_x_authorize_a {
	__u8 struct_v;
	__le64 global_id;
	__le32 service_id;
	struct ceph_x_ticket_blob ticket_blob;
} __attribute__ ((packed));
68
/* Second piece of an authorizer, sent encrypted with the session key. */
struct ceph_x_authorize_b {
	__u8 struct_v;
	__le64 nonce;		/* random per-authorizer value */
} __attribute__ ((packed));
73
/* Service's reply to an authorizer, encrypted with the session key. */
struct ceph_x_authorize_reply {
	__u8 struct_v;
	__le64 nonce_plus_one;	/* must equal the authorizer's nonce + 1 */
} __attribute__ ((packed));
78
79
80/*
81 * encryption bundle
82 */
83#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
84
/* Prepended to every plaintext before encryption and checked after decryption. */
struct ceph_x_encrypt_header {
	__u8 struct_v;		/* must be 1 */
	__le64 magic;		/* must be CEPHX_ENC_MAGIC */
} __attribute__ ((packed));
89
90#endif
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
new file mode 100644
index 000000000000..c67535d70aa6
--- /dev/null
+++ b/fs/ceph/buffer.c
@@ -0,0 +1,81 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5
6#include "buffer.h"
7#include "decode.h"
8
9struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
10{
11 struct ceph_buffer *b;
12
13 b = kmalloc(sizeof(*b), gfp);
14 if (!b)
15 return NULL;
16
17 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
18 if (b->vec.iov_base) {
19 b->is_vmalloc = false;
20 } else {
21 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
22 if (!b->vec.iov_base) {
23 kfree(b);
24 return NULL;
25 }
26 b->is_vmalloc = true;
27 }
28
29 kref_init(&b->kref);
30 b->alloc_len = len;
31 b->vec.iov_len = len;
32 dout("buffer_new %p\n", b);
33 return b;
34}
35
36void ceph_buffer_release(struct kref *kref)
37{
38 struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
39
40 dout("buffer_release %p\n", b);
41 if (b->vec.iov_base) {
42 if (b->is_vmalloc)
43 vfree(b->vec.iov_base);
44 else
45 kfree(b->vec.iov_base);
46 }
47 kfree(b);
48}
49
50int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
51{
52 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
53 if (b->vec.iov_base) {
54 b->is_vmalloc = false;
55 } else {
56 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
57 b->is_vmalloc = true;
58 }
59 if (!b->vec.iov_base)
60 return -ENOMEM;
61 b->alloc_len = len;
62 b->vec.iov_len = len;
63 return 0;
64}
65
66int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
67{
68 size_t len;
69
70 ceph_decode_need(p, end, sizeof(u32), bad);
71 len = ceph_decode_32(p);
72 dout("decode_buffer len %d\n", (int)len);
73 ceph_decode_need(p, end, len, bad);
74 *b = ceph_buffer_new(len, GFP_NOFS);
75 if (!*b)
76 return -ENOMEM;
77 ceph_decode_copy(p, (*b)->vec.iov_base, len);
78 return 0;
79bad:
80 return -EINVAL;
81}
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
new file mode 100644
index 000000000000..58d19014068f
--- /dev/null
+++ b/fs/ceph/buffer.h
@@ -0,0 +1,39 @@
1#ifndef __FS_CEPH_BUFFER_H
2#define __FS_CEPH_BUFFER_H
3
4#include <linux/kref.h>
5#include <linux/mm.h>
6#include <linux/vmalloc.h>
7#include <linux/types.h>
8#include <linux/uio.h>
9
/*
 * a simple reference counted buffer.
 *
 * the data is allocated with kmalloc when that succeeds, and with
 * vmalloc as a fallback when it does not; is_vmalloc records which
 * one was used so the matching free routine is called on release.
 */
struct ceph_buffer {
	struct kref kref;	/* released via ceph_buffer_release() */
	struct kvec vec;	/* data pointer and current length */
	size_t alloc_len;	/* bytes allocated at vec.iov_base */
	bool is_vmalloc;	/* true if vec.iov_base came from vmalloc */
};
22
23extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
24extern void ceph_buffer_release(struct kref *kref);
25
26static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
27{
28 kref_get(&b->kref);
29 return b;
30}
31
32static inline void ceph_buffer_put(struct ceph_buffer *b)
33{
34 kref_put(&b->kref, ceph_buffer_release);
35}
36
37extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
38
39#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
new file mode 100644
index 000000000000..aa2239fa9a3b
--- /dev/null
+++ b/fs/ceph/caps.c
@@ -0,0 +1,2955 @@
1#include "ceph_debug.h"
2
3#include <linux/fs.h>
4#include <linux/kernel.h>
5#include <linux/sched.h>
6#include <linux/slab.h>
7#include <linux/vmalloc.h>
8#include <linux/wait.h>
9#include <linux/writeback.h>
10
11#include "super.h"
12#include "decode.h"
13#include "messenger.h"
14
15/*
16 * Capability management
17 *
18 * The Ceph metadata servers control client access to inode metadata
19 * and file data by issuing capabilities, granting clients permission
20 * to read and/or write both inode field and file data to OSDs
21 * (storage nodes). Each capability consists of a set of bits
22 * indicating which operations are allowed.
23 *
24 * If the client holds a *_SHARED cap, the client has a coherent value
25 * that can be safely read from the cached inode.
26 *
27 * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
28 * client is allowed to change inode attributes (e.g., file size,
29 * mtime), note its dirty state in the ceph_cap, and asynchronously
30 * flush that metadata change to the MDS.
31 *
32 * In the event of a conflicting operation (perhaps by another
33 * client), the MDS will revoke the conflicting client capabilities.
34 *
35 * In order for a client to cache an inode, it must hold a capability
36 * with at least one MDS server. When inodes are released, release
37 * notifications are batched and periodically sent en masse to the MDS
38 * cluster to release server state.
39 */
40
41
/*
 * Scratch space for readable cap strings used in debugging output.
 *
 * ceph_cap_string() hands out the MAX_CAP_STR slots in round-robin
 * order; last_cap_str is the next slot to use and is protected by
 * cap_str_lock.
 */
#define MAX_CAP_STR 20
static char cap_str[MAX_CAP_STR][40];
static DEFINE_SPINLOCK(cap_str_lock);
static int last_cap_str;
49
50static char *gcap_string(char *s, int c)
51{
52 if (c & CEPH_CAP_GSHARED)
53 *s++ = 's';
54 if (c & CEPH_CAP_GEXCL)
55 *s++ = 'x';
56 if (c & CEPH_CAP_GCACHE)
57 *s++ = 'c';
58 if (c & CEPH_CAP_GRD)
59 *s++ = 'r';
60 if (c & CEPH_CAP_GWR)
61 *s++ = 'w';
62 if (c & CEPH_CAP_GBUFFER)
63 *s++ = 'b';
64 if (c & CEPH_CAP_GLAZYIO)
65 *s++ = 'l';
66 return s;
67}
68
69const char *ceph_cap_string(int caps)
70{
71 int i;
72 char *s;
73 int c;
74
75 spin_lock(&cap_str_lock);
76 i = last_cap_str++;
77 if (last_cap_str == MAX_CAP_STR)
78 last_cap_str = 0;
79 spin_unlock(&cap_str_lock);
80
81 s = cap_str[i];
82
83 if (caps & CEPH_CAP_PIN)
84 *s++ = 'p';
85
86 c = (caps >> CEPH_CAP_SAUTH) & 3;
87 if (c) {
88 *s++ = 'A';
89 s = gcap_string(s, c);
90 }
91
92 c = (caps >> CEPH_CAP_SLINK) & 3;
93 if (c) {
94 *s++ = 'L';
95 s = gcap_string(s, c);
96 }
97
98 c = (caps >> CEPH_CAP_SXATTR) & 3;
99 if (c) {
100 *s++ = 'X';
101 s = gcap_string(s, c);
102 }
103
104 c = caps >> CEPH_CAP_SFILE;
105 if (c) {
106 *s++ = 'F';
107 s = gcap_string(s, c);
108 }
109
110 if (s == cap_str[i])
111 *s++ = '-';
112 *s = 0;
113 return cap_str[i];
114}
115
116/*
117 * Cap reservations
118 *
119 * Maintain a global pool of preallocated struct ceph_caps, referenced
120 * by struct ceph_caps_reservations. This ensures that we preallocate
121 * memory needed to successfully process an MDS response. (If an MDS
122 * sends us cap information and we fail to process it, we will have
123 * problems due to the client and MDS being out of sync.)
124 *
125 * Reservations are 'owned' by a ceph_cap_reservation context.
126 */
127static spinlock_t caps_list_lock;
128static struct list_head caps_list; /* unused (reserved or unreserved) */
129static int caps_total_count; /* total caps allocated */
130static int caps_use_count; /* in use */
131static int caps_reserve_count; /* unused, reserved */
132static int caps_avail_count; /* unused, unreserved */
133static int caps_min_count; /* keep at least this many (unreserved) */
134
135void __init ceph_caps_init(void)
136{
137 INIT_LIST_HEAD(&caps_list);
138 spin_lock_init(&caps_list_lock);
139}
140
141void ceph_caps_finalize(void)
142{
143 struct ceph_cap *cap;
144
145 spin_lock(&caps_list_lock);
146 while (!list_empty(&caps_list)) {
147 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
148 list_del(&cap->caps_item);
149 kmem_cache_free(ceph_cap_cachep, cap);
150 }
151 caps_total_count = 0;
152 caps_avail_count = 0;
153 caps_use_count = 0;
154 caps_reserve_count = 0;
155 caps_min_count = 0;
156 spin_unlock(&caps_list_lock);
157}
158
159void ceph_adjust_min_caps(int delta)
160{
161 spin_lock(&caps_list_lock);
162 caps_min_count += delta;
163 BUG_ON(caps_min_count < 0);
164 spin_unlock(&caps_list_lock);
165}
166
167int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
168{
169 int i;
170 struct ceph_cap *cap;
171 int have;
172 int alloc = 0;
173 LIST_HEAD(newcaps);
174 int ret = 0;
175
176 dout("reserve caps ctx=%p need=%d\n", ctx, need);
177
178 /* first reserve any caps that are already allocated */
179 spin_lock(&caps_list_lock);
180 if (caps_avail_count >= need)
181 have = need;
182 else
183 have = caps_avail_count;
184 caps_avail_count -= have;
185 caps_reserve_count += have;
186 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
187 caps_avail_count);
188 spin_unlock(&caps_list_lock);
189
190 for (i = have; i < need; i++) {
191 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
192 if (!cap) {
193 ret = -ENOMEM;
194 goto out_alloc_count;
195 }
196 list_add(&cap->caps_item, &newcaps);
197 alloc++;
198 }
199 BUG_ON(have + alloc != need);
200
201 spin_lock(&caps_list_lock);
202 caps_total_count += alloc;
203 caps_reserve_count += alloc;
204 list_splice(&newcaps, &caps_list);
205
206 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
207 caps_avail_count);
208 spin_unlock(&caps_list_lock);
209
210 ctx->count = need;
211 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
212 ctx, caps_total_count, caps_use_count, caps_reserve_count,
213 caps_avail_count);
214 return 0;
215
216out_alloc_count:
217 /* we didn't manage to reserve as much as we needed */
218 pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
219 ctx, need, have);
220 return ret;
221}
222
223int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
224{
225 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
226 if (ctx->count) {
227 spin_lock(&caps_list_lock);
228 BUG_ON(caps_reserve_count < ctx->count);
229 caps_reserve_count -= ctx->count;
230 caps_avail_count += ctx->count;
231 ctx->count = 0;
232 dout("unreserve caps %d = %d used + %d resv + %d avail\n",
233 caps_total_count, caps_use_count, caps_reserve_count,
234 caps_avail_count);
235 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
236 caps_avail_count);
237 spin_unlock(&caps_list_lock);
238 }
239 return 0;
240}
241
242static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
243{
244 struct ceph_cap *cap = NULL;
245
246 /* temporary, until we do something about cap import/export */
247 if (!ctx)
248 return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
249
250 spin_lock(&caps_list_lock);
251 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
252 ctx, ctx->count, caps_total_count, caps_use_count,
253 caps_reserve_count, caps_avail_count);
254 BUG_ON(!ctx->count);
255 BUG_ON(ctx->count > caps_reserve_count);
256 BUG_ON(list_empty(&caps_list));
257
258 ctx->count--;
259 caps_reserve_count--;
260 caps_use_count++;
261
262 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
263 list_del(&cap->caps_item);
264
265 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
266 caps_avail_count);
267 spin_unlock(&caps_list_lock);
268 return cap;
269}
270
271void ceph_put_cap(struct ceph_cap *cap)
272{
273 spin_lock(&caps_list_lock);
274 dout("put_cap %p %d = %d used + %d resv + %d avail\n",
275 cap, caps_total_count, caps_use_count,
276 caps_reserve_count, caps_avail_count);
277 caps_use_count--;
278 /*
279 * Keep some preallocated caps around (ceph_min_count), to
280 * avoid lots of free/alloc churn.
281 */
282 if (caps_avail_count >= caps_reserve_count + caps_min_count) {
283 caps_total_count--;
284 kmem_cache_free(ceph_cap_cachep, cap);
285 } else {
286 caps_avail_count++;
287 list_add(&cap->caps_item, &caps_list);
288 }
289
290 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
291 caps_avail_count);
292 spin_unlock(&caps_list_lock);
293}
294
295void ceph_reservation_status(struct ceph_client *client,
296 int *total, int *avail, int *used, int *reserved,
297 int *min)
298{
299 if (total)
300 *total = caps_total_count;
301 if (avail)
302 *avail = caps_avail_count;
303 if (used)
304 *used = caps_use_count;
305 if (reserved)
306 *reserved = caps_reserve_count;
307 if (min)
308 *min = caps_min_count;
309}
310
311/*
312 * Find ceph_cap for given mds, if any.
313 *
314 * Called with i_lock held.
315 */
316static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
317{
318 struct ceph_cap *cap;
319 struct rb_node *n = ci->i_caps.rb_node;
320
321 while (n) {
322 cap = rb_entry(n, struct ceph_cap, ci_node);
323 if (mds < cap->mds)
324 n = n->rb_left;
325 else if (mds > cap->mds)
326 n = n->rb_right;
327 else
328 return cap;
329 }
330 return NULL;
331}
332
333/*
334 * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
335 * -1.
336 */
337static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
338{
339 struct ceph_cap *cap;
340 int mds = -1;
341 struct rb_node *p;
342
343 /* prefer mds with WR|WRBUFFER|EXCL caps */
344 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
345 cap = rb_entry(p, struct ceph_cap, ci_node);
346 mds = cap->mds;
347 if (mseq)
348 *mseq = cap->mseq;
349 if (cap->issued & (CEPH_CAP_FILE_WR |
350 CEPH_CAP_FILE_BUFFER |
351 CEPH_CAP_FILE_EXCL))
352 break;
353 }
354 return mds;
355}
356
357int ceph_get_cap_mds(struct inode *inode)
358{
359 int mds;
360 spin_lock(&inode->i_lock);
361 mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
362 spin_unlock(&inode->i_lock);
363 return mds;
364}
365
366/*
367 * Called under i_lock.
368 */
369static void __insert_cap_node(struct ceph_inode_info *ci,
370 struct ceph_cap *new)
371{
372 struct rb_node **p = &ci->i_caps.rb_node;
373 struct rb_node *parent = NULL;
374 struct ceph_cap *cap = NULL;
375
376 while (*p) {
377 parent = *p;
378 cap = rb_entry(parent, struct ceph_cap, ci_node);
379 if (new->mds < cap->mds)
380 p = &(*p)->rb_left;
381 else if (new->mds > cap->mds)
382 p = &(*p)->rb_right;
383 else
384 BUG();
385 }
386
387 rb_link_node(&new->ci_node, parent, p);
388 rb_insert_color(&new->ci_node, &ci->i_caps);
389}
390
391/*
392 * (re)set cap hold timeouts, which control the delayed release
393 * of unused caps back to the MDS. Should be called on cap use.
394 */
395static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
396 struct ceph_inode_info *ci)
397{
398 struct ceph_mount_args *ma = mdsc->client->mount_args;
399
400 ci->i_hold_caps_min = round_jiffies(jiffies +
401 ma->caps_wanted_delay_min * HZ);
402 ci->i_hold_caps_max = round_jiffies(jiffies +
403 ma->caps_wanted_delay_max * HZ);
404 dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
405 ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
406}
407
408/*
409 * (Re)queue cap at the end of the delayed cap release list.
410 *
411 * If I_FLUSH is set, leave the inode at the front of the list.
412 *
413 * Caller holds i_lock
414 * -> we take mdsc->cap_delay_lock
415 */
416static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
417 struct ceph_inode_info *ci)
418{
419 __cap_set_timeouts(mdsc, ci);
420 dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
421 ci->i_ceph_flags, ci->i_hold_caps_max);
422 if (!mdsc->stopping) {
423 spin_lock(&mdsc->cap_delay_lock);
424 if (!list_empty(&ci->i_cap_delay_list)) {
425 if (ci->i_ceph_flags & CEPH_I_FLUSH)
426 goto no_change;
427 list_del_init(&ci->i_cap_delay_list);
428 }
429 list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
430no_change:
431 spin_unlock(&mdsc->cap_delay_lock);
432 }
433}
434
435/*
436 * Queue an inode for immediate writeback. Mark inode with I_FLUSH,
437 * indicating we should send a cap message to flush dirty metadata
438 * asap, and move to the front of the delayed cap list.
439 */
440static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
441 struct ceph_inode_info *ci)
442{
443 dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
444 spin_lock(&mdsc->cap_delay_lock);
445 ci->i_ceph_flags |= CEPH_I_FLUSH;
446 if (!list_empty(&ci->i_cap_delay_list))
447 list_del_init(&ci->i_cap_delay_list);
448 list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
449 spin_unlock(&mdsc->cap_delay_lock);
450}
451
452/*
453 * Cancel delayed work on cap.
454 *
455 * Caller must hold i_lock.
456 */
457static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
458 struct ceph_inode_info *ci)
459{
460 dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
461 if (list_empty(&ci->i_cap_delay_list))
462 return;
463 spin_lock(&mdsc->cap_delay_lock);
464 list_del_init(&ci->i_cap_delay_list);
465 spin_unlock(&mdsc->cap_delay_lock);
466}
467
468/*
469 * Common issue checks for add_cap, handle_cap_grant.
470 */
471static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
472 unsigned issued)
473{
474 unsigned had = __ceph_caps_issued(ci, NULL);
475
476 /*
477 * Each time we receive FILE_CACHE anew, we increment
478 * i_rdcache_gen.
479 */
480 if ((issued & CEPH_CAP_FILE_CACHE) &&
481 (had & CEPH_CAP_FILE_CACHE) == 0)
482 ci->i_rdcache_gen++;
483
484 /*
485 * if we are newly issued FILE_SHARED, clear I_COMPLETE; we
486 * don't know what happened to this directory while we didn't
487 * have the cap.
488 */
489 if ((issued & CEPH_CAP_FILE_SHARED) &&
490 (had & CEPH_CAP_FILE_SHARED) == 0) {
491 ci->i_shared_gen++;
492 if (S_ISDIR(ci->vfs_inode.i_mode)) {
493 dout(" marking %p NOT complete\n", &ci->vfs_inode);
494 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
495 }
496 }
497}
498
499/*
500 * Add a capability under the given MDS session.
501 *
502 * Caller should hold session snap_rwsem (read) and s_mutex.
503 *
504 * @fmode is the open file mode, if we are opening a file, otherwise
505 * it is < 0. (This is so we can atomically add the cap and add an
506 * open file reference to it.)
507 */
508int ceph_add_cap(struct inode *inode,
509 struct ceph_mds_session *session, u64 cap_id,
510 int fmode, unsigned issued, unsigned wanted,
511 unsigned seq, unsigned mseq, u64 realmino, int flags,
512 struct ceph_cap_reservation *caps_reservation)
513{
514 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
515 struct ceph_inode_info *ci = ceph_inode(inode);
516 struct ceph_cap *new_cap = NULL;
517 struct ceph_cap *cap;
518 int mds = session->s_mds;
519 int actual_wanted;
520
521 dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
522 session->s_mds, cap_id, ceph_cap_string(issued), seq);
523
524 /*
525 * If we are opening the file, include file mode wanted bits
526 * in wanted.
527 */
528 if (fmode >= 0)
529 wanted |= ceph_caps_for_mode(fmode);
530
531retry:
532 spin_lock(&inode->i_lock);
533 cap = __get_cap_for_mds(ci, mds);
534 if (!cap) {
535 if (new_cap) {
536 cap = new_cap;
537 new_cap = NULL;
538 } else {
539 spin_unlock(&inode->i_lock);
540 new_cap = get_cap(caps_reservation);
541 if (new_cap == NULL)
542 return -ENOMEM;
543 goto retry;
544 }
545
546 cap->issued = 0;
547 cap->implemented = 0;
548 cap->mds = mds;
549 cap->mds_wanted = 0;
550
551 cap->ci = ci;
552 __insert_cap_node(ci, cap);
553
554 /* clear out old exporting info? (i.e. on cap import) */
555 if (ci->i_cap_exporting_mds == mds) {
556 ci->i_cap_exporting_issued = 0;
557 ci->i_cap_exporting_mseq = 0;
558 ci->i_cap_exporting_mds = -1;
559 }
560
561 /* add to session cap list */
562 cap->session = session;
563 spin_lock(&session->s_cap_lock);
564 list_add_tail(&cap->session_caps, &session->s_caps);
565 session->s_nr_caps++;
566 spin_unlock(&session->s_cap_lock);
567 }
568
569 if (!ci->i_snap_realm) {
570 /*
571 * add this inode to the appropriate snap realm
572 */
573 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
574 realmino);
575 if (realm) {
576 ceph_get_snap_realm(mdsc, realm);
577 spin_lock(&realm->inodes_with_caps_lock);
578 ci->i_snap_realm = realm;
579 list_add(&ci->i_snap_realm_item,
580 &realm->inodes_with_caps);
581 spin_unlock(&realm->inodes_with_caps_lock);
582 } else {
583 pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
584 realmino);
585 }
586 }
587
588 __check_cap_issue(ci, cap, issued);
589
590 /*
591 * If we are issued caps we don't want, or the mds' wanted
592 * value appears to be off, queue a check so we'll release
593 * later and/or update the mds wanted value.
594 */
595 actual_wanted = __ceph_caps_wanted(ci);
596 if ((wanted & ~actual_wanted) ||
597 (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
598 dout(" issued %s, mds wanted %s, actual %s, queueing\n",
599 ceph_cap_string(issued), ceph_cap_string(wanted),
600 ceph_cap_string(actual_wanted));
601 __cap_delay_requeue(mdsc, ci);
602 }
603
604 if (flags & CEPH_CAP_FLAG_AUTH)
605 ci->i_auth_cap = cap;
606 else if (ci->i_auth_cap == cap)
607 ci->i_auth_cap = NULL;
608
609 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
610 inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
611 ceph_cap_string(issued|cap->issued), seq, mds);
612 cap->cap_id = cap_id;
613 cap->issued = issued;
614 cap->implemented |= issued;
615 cap->mds_wanted |= wanted;
616 cap->seq = seq;
617 cap->issue_seq = seq;
618 cap->mseq = mseq;
619 cap->cap_gen = session->s_cap_gen;
620
621 if (fmode >= 0)
622 __ceph_get_fmode(ci, fmode);
623 spin_unlock(&inode->i_lock);
624 wake_up(&ci->i_cap_wq);
625 return 0;
626}
627
628/*
629 * Return true if cap has not timed out and belongs to the current
630 * generation of the MDS session (i.e. has not gone 'stale' due to
631 * us losing touch with the mds).
632 */
633static int __cap_is_valid(struct ceph_cap *cap)
634{
635 unsigned long ttl;
636 u32 gen;
637
638 spin_lock(&cap->session->s_cap_lock);
639 gen = cap->session->s_cap_gen;
640 ttl = cap->session->s_cap_ttl;
641 spin_unlock(&cap->session->s_cap_lock);
642
643 if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
644 dout("__cap_is_valid %p cap %p issued %s "
645 "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
646 cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
647 return 0;
648 }
649
650 return 1;
651}
652
653/*
654 * Return set of valid cap bits issued to us. Note that caps time
655 * out, and may be invalidated in bulk if the client session times out
656 * and session->s_cap_gen is bumped.
657 */
658int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
659{
660 int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
661 struct ceph_cap *cap;
662 struct rb_node *p;
663
664 if (implemented)
665 *implemented = 0;
666 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
667 cap = rb_entry(p, struct ceph_cap, ci_node);
668 if (!__cap_is_valid(cap))
669 continue;
670 dout("__ceph_caps_issued %p cap %p issued %s\n",
671 &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
672 have |= cap->issued;
673 if (implemented)
674 *implemented |= cap->implemented;
675 }
676 return have;
677}
678
679/*
680 * Get cap bits issued by caps other than @ocap
681 */
682int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
683{
684 int have = ci->i_snap_caps;
685 struct ceph_cap *cap;
686 struct rb_node *p;
687
688 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
689 cap = rb_entry(p, struct ceph_cap, ci_node);
690 if (cap == ocap)
691 continue;
692 if (!__cap_is_valid(cap))
693 continue;
694 have |= cap->issued;
695 }
696 return have;
697}
698
699/*
700 * Move a cap to the end of the LRU (oldest caps at list head, newest
701 * at list tail).
702 */
703static void __touch_cap(struct ceph_cap *cap)
704{
705 struct ceph_mds_session *s = cap->session;
706
707 spin_lock(&s->s_cap_lock);
708 if (s->s_cap_iterator == NULL) {
709 dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
710 s->s_mds);
711 list_move_tail(&cap->session_caps, &s->s_caps);
712 } else {
713 dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
714 &cap->ci->vfs_inode, cap, s->s_mds);
715 }
716 spin_unlock(&s->s_cap_lock);
717}
718
719/*
720 * Check if we hold the given mask. If so, move the cap(s) to the
721 * front of their respective LRUs. (This is the preferred way for
722 * callers to check for caps they want.)
723 */
724int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
725{
726 struct ceph_cap *cap;
727 struct rb_node *p;
728 int have = ci->i_snap_caps;
729
730 if ((have & mask) == mask) {
731 dout("__ceph_caps_issued_mask %p snap issued %s"
732 " (mask %s)\n", &ci->vfs_inode,
733 ceph_cap_string(have),
734 ceph_cap_string(mask));
735 return 1;
736 }
737
738 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
739 cap = rb_entry(p, struct ceph_cap, ci_node);
740 if (!__cap_is_valid(cap))
741 continue;
742 if ((cap->issued & mask) == mask) {
743 dout("__ceph_caps_issued_mask %p cap %p issued %s"
744 " (mask %s)\n", &ci->vfs_inode, cap,
745 ceph_cap_string(cap->issued),
746 ceph_cap_string(mask));
747 if (touch)
748 __touch_cap(cap);
749 return 1;
750 }
751
752 /* does a combination of caps satisfy mask? */
753 have |= cap->issued;
754 if ((have & mask) == mask) {
755 dout("__ceph_caps_issued_mask %p combo issued %s"
756 " (mask %s)\n", &ci->vfs_inode,
757 ceph_cap_string(cap->issued),
758 ceph_cap_string(mask));
759 if (touch) {
760 struct rb_node *q;
761
762 /* touch this + preceeding caps */
763 __touch_cap(cap);
764 for (q = rb_first(&ci->i_caps); q != p;
765 q = rb_next(q)) {
766 cap = rb_entry(q, struct ceph_cap,
767 ci_node);
768 if (!__cap_is_valid(cap))
769 continue;
770 __touch_cap(cap);
771 }
772 }
773 return 1;
774 }
775 }
776
777 return 0;
778}
779
780/*
781 * Return true if mask caps are currently being revoked by an MDS.
782 */
783int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
784{
785 struct inode *inode = &ci->vfs_inode;
786 struct ceph_cap *cap;
787 struct rb_node *p;
788 int ret = 0;
789
790 spin_lock(&inode->i_lock);
791 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
792 cap = rb_entry(p, struct ceph_cap, ci_node);
793 if (__cap_is_valid(cap) &&
794 (cap->implemented & ~cap->issued & mask)) {
795 ret = 1;
796 break;
797 }
798 }
799 spin_unlock(&inode->i_lock);
800 dout("ceph_caps_revoking %p %s = %d\n", inode,
801 ceph_cap_string(mask), ret);
802 return ret;
803}
804
805int __ceph_caps_used(struct ceph_inode_info *ci)
806{
807 int used = 0;
808 if (ci->i_pin_ref)
809 used |= CEPH_CAP_PIN;
810 if (ci->i_rd_ref)
811 used |= CEPH_CAP_FILE_RD;
812 if (ci->i_rdcache_ref || ci->i_rdcache_gen)
813 used |= CEPH_CAP_FILE_CACHE;
814 if (ci->i_wr_ref)
815 used |= CEPH_CAP_FILE_WR;
816 if (ci->i_wrbuffer_ref)
817 used |= CEPH_CAP_FILE_BUFFER;
818 return used;
819}
820
821/*
822 * wanted, by virtue of open file modes
823 */
824int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
825{
826 int want = 0;
827 int mode;
828 for (mode = 0; mode < 4; mode++)
829 if (ci->i_nr_by_mode[mode])
830 want |= ceph_caps_for_mode(mode);
831 return want;
832}
833
834/*
835 * Return caps we have registered with the MDS(s) as 'wanted'.
836 */
837int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
838{
839 struct ceph_cap *cap;
840 struct rb_node *p;
841 int mds_wanted = 0;
842
843 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
844 cap = rb_entry(p, struct ceph_cap, ci_node);
845 if (!__cap_is_valid(cap))
846 continue;
847 mds_wanted |= cap->mds_wanted;
848 }
849 return mds_wanted;
850}
851
852/*
853 * called under i_lock
854 */
855static int __ceph_is_any_caps(struct ceph_inode_info *ci)
856{
857 return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
858}
859
860/*
861 * caller should hold i_lock.
862 * caller will not hold session s_mutex if called from destroy_inode.
863 */
864void __ceph_remove_cap(struct ceph_cap *cap)
865{
866 struct ceph_mds_session *session = cap->session;
867 struct ceph_inode_info *ci = cap->ci;
868 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
869
870 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
871
872 /* remove from inode list */
873 rb_erase(&cap->ci_node, &ci->i_caps);
874 cap->ci = NULL;
875 if (ci->i_auth_cap == cap)
876 ci->i_auth_cap = NULL;
877
878 /* remove from session list */
879 spin_lock(&session->s_cap_lock);
880 if (session->s_cap_iterator == cap) {
881 /* not yet, we are iterating over this very cap */
882 dout("__ceph_remove_cap delaying %p removal from session %p\n",
883 cap, cap->session);
884 } else {
885 list_del_init(&cap->session_caps);
886 session->s_nr_caps--;
887 cap->session = NULL;
888 }
889 spin_unlock(&session->s_cap_lock);
890
891 if (cap->session == NULL)
892 ceph_put_cap(cap);
893
894 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
895 struct ceph_snap_realm *realm = ci->i_snap_realm;
896 spin_lock(&realm->inodes_with_caps_lock);
897 list_del_init(&ci->i_snap_realm_item);
898 ci->i_snap_realm_counter++;
899 ci->i_snap_realm = NULL;
900 spin_unlock(&realm->inodes_with_caps_lock);
901 ceph_put_snap_realm(mdsc, realm);
902 }
903 if (!__ceph_is_any_real_caps(ci))
904 __cap_delay_cancel(mdsc, ci);
905}
906
907/*
908 * Build and send a cap message to the given MDS.
909 *
910 * Caller should be holding s_mutex.
911 */
912static int send_cap_msg(struct ceph_mds_session *session,
913 u64 ino, u64 cid, int op,
914 int caps, int wanted, int dirty,
915 u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
916 u64 size, u64 max_size,
917 struct timespec *mtime, struct timespec *atime,
918 u64 time_warp_seq,
919 uid_t uid, gid_t gid, mode_t mode,
920 u64 xattr_version,
921 struct ceph_buffer *xattrs_buf,
922 u64 follows)
923{
924 struct ceph_mds_caps *fc;
925 struct ceph_msg *msg;
926
927 dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
928 " seq %u/%u mseq %u follows %lld size %llu/%llu"
929 " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
930 cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
931 ceph_cap_string(dirty),
932 seq, issue_seq, mseq, follows, size, max_size,
933 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
934
935 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
936 if (IS_ERR(msg))
937 return PTR_ERR(msg);
938
939 msg->hdr.tid = cpu_to_le64(flush_tid);
940
941 fc = msg->front.iov_base;
942 memset(fc, 0, sizeof(*fc));
943
944 fc->cap_id = cpu_to_le64(cid);
945 fc->op = cpu_to_le32(op);
946 fc->seq = cpu_to_le32(seq);
947 fc->issue_seq = cpu_to_le32(issue_seq);
948 fc->migrate_seq = cpu_to_le32(mseq);
949 fc->caps = cpu_to_le32(caps);
950 fc->wanted = cpu_to_le32(wanted);
951 fc->dirty = cpu_to_le32(dirty);
952 fc->ino = cpu_to_le64(ino);
953 fc->snap_follows = cpu_to_le64(follows);
954
955 fc->size = cpu_to_le64(size);
956 fc->max_size = cpu_to_le64(max_size);
957 if (mtime)
958 ceph_encode_timespec(&fc->mtime, mtime);
959 if (atime)
960 ceph_encode_timespec(&fc->atime, atime);
961 fc->time_warp_seq = cpu_to_le32(time_warp_seq);
962
963 fc->uid = cpu_to_le32(uid);
964 fc->gid = cpu_to_le32(gid);
965 fc->mode = cpu_to_le32(mode);
966
967 fc->xattr_version = cpu_to_le64(xattr_version);
968 if (xattrs_buf) {
969 msg->middle = ceph_buffer_get(xattrs_buf);
970 fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
971 msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
972 }
973
974 ceph_con_send(&session->s_con, msg);
975 return 0;
976}
977
978/*
979 * Queue cap releases when an inode is dropped from our cache. Since
980 * inode is about to be destroyed, there is no need for i_lock.
981 */
982void ceph_queue_caps_release(struct inode *inode)
983{
984 struct ceph_inode_info *ci = ceph_inode(inode);
985 struct rb_node *p;
986
987 p = rb_first(&ci->i_caps);
988 while (p) {
989 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
990 struct ceph_mds_session *session = cap->session;
991 struct ceph_msg *msg;
992 struct ceph_mds_cap_release *head;
993 struct ceph_mds_cap_item *item;
994
995 spin_lock(&session->s_cap_lock);
996 BUG_ON(!session->s_num_cap_releases);
997 msg = list_first_entry(&session->s_cap_releases,
998 struct ceph_msg, list_head);
999
1000 dout(" adding %p release to mds%d msg %p (%d left)\n",
1001 inode, session->s_mds, msg, session->s_num_cap_releases);
1002
1003 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1004 head = msg->front.iov_base;
1005 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1006 item = msg->front.iov_base + msg->front.iov_len;
1007 item->ino = cpu_to_le64(ceph_ino(inode));
1008 item->cap_id = cpu_to_le64(cap->cap_id);
1009 item->migrate_seq = cpu_to_le32(cap->mseq);
1010 item->seq = cpu_to_le32(cap->issue_seq);
1011
1012 session->s_num_cap_releases--;
1013
1014 msg->front.iov_len += sizeof(*item);
1015 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1016 dout(" release msg %p full\n", msg);
1017 list_move_tail(&msg->list_head,
1018 &session->s_cap_releases_done);
1019 } else {
1020 dout(" release msg %p at %d/%d (%d)\n", msg,
1021 (int)le32_to_cpu(head->num),
1022 (int)CEPH_CAPS_PER_RELEASE,
1023 (int)msg->front.iov_len);
1024 }
1025 spin_unlock(&session->s_cap_lock);
1026 p = rb_next(p);
1027 __ceph_remove_cap(cap);
1028 }
1029}
1030
1031/*
1032 * Send a cap msg on the given inode. Update our caps state, then
1033 * drop i_lock and send the message.
1034 *
1035 * Make note of max_size reported/requested from mds, revoked caps
1036 * that have now been implemented.
1037 *
1038 * Make half-hearted attempt ot to invalidate page cache if we are
1039 * dropping RDCACHE. Note that this will leave behind locked pages
1040 * that we'll then need to deal with elsewhere.
1041 *
1042 * Return non-zero if delayed release, or we experienced an error
1043 * such that the caller should requeue + retry later.
1044 *
1045 * called with i_lock, then drops it.
1046 * caller should hold snap_rwsem (read), s_mutex.
1047 */
1048static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1049 int op, int used, int want, int retain, int flushing,
1050 unsigned *pflush_tid)
1051 __releases(cap->ci->vfs_inode->i_lock)
1052{
1053 struct ceph_inode_info *ci = cap->ci;
1054 struct inode *inode = &ci->vfs_inode;
1055 u64 cap_id = cap->cap_id;
1056 int held, revoking, dropping, keep;
1057 u64 seq, issue_seq, mseq, time_warp_seq, follows;
1058 u64 size, max_size;
1059 struct timespec mtime, atime;
1060 int wake = 0;
1061 mode_t mode;
1062 uid_t uid;
1063 gid_t gid;
1064 struct ceph_mds_session *session;
1065 u64 xattr_version = 0;
1066 int delayed = 0;
1067 u64 flush_tid = 0;
1068 int i;
1069 int ret;
1070
1071 held = cap->issued | cap->implemented;
1072 revoking = cap->implemented & ~cap->issued;
1073 retain &= ~revoking;
1074 dropping = cap->issued & ~retain;
1075
1076 dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
1077 inode, cap, cap->session,
1078 ceph_cap_string(held), ceph_cap_string(held & retain),
1079 ceph_cap_string(revoking));
1080 BUG_ON((retain & CEPH_CAP_PIN) == 0);
1081
1082 session = cap->session;
1083
1084 /* don't release wanted unless we've waited a bit. */
1085 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1086 time_before(jiffies, ci->i_hold_caps_min)) {
1087 dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
1088 ceph_cap_string(cap->issued),
1089 ceph_cap_string(cap->issued & retain),
1090 ceph_cap_string(cap->mds_wanted),
1091 ceph_cap_string(want));
1092 want |= cap->mds_wanted;
1093 retain |= cap->issued;
1094 delayed = 1;
1095 }
1096 ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
1097
1098 cap->issued &= retain; /* drop bits we don't want */
1099 if (cap->implemented & ~cap->issued) {
1100 /*
1101 * Wake up any waiters on wanted -> needed transition.
1102 * This is due to the weird transition from buffered
1103 * to sync IO... we need to flush dirty pages _before_
1104 * allowing sync writes to avoid reordering.
1105 */
1106 wake = 1;
1107 }
1108 cap->implemented &= cap->issued | used;
1109 cap->mds_wanted = want;
1110
1111 if (flushing) {
1112 /*
1113 * assign a tid for flush operations so we can avoid
1114 * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
1115 * clean type races. track latest tid for every bit
1116 * so we can handle flush AxFw, flush Fw, and have the
1117 * first ack clean Ax.
1118 */
1119 flush_tid = ++ci->i_cap_flush_last_tid;
1120 if (pflush_tid)
1121 *pflush_tid = flush_tid;
1122 dout(" cap_flush_tid %d\n", (int)flush_tid);
1123 for (i = 0; i < CEPH_CAP_BITS; i++)
1124 if (flushing & (1 << i))
1125 ci->i_cap_flush_tid[i] = flush_tid;
1126 }
1127
1128 keep = cap->implemented;
1129 seq = cap->seq;
1130 issue_seq = cap->issue_seq;
1131 mseq = cap->mseq;
1132 size = inode->i_size;
1133 ci->i_reported_size = size;
1134 max_size = ci->i_wanted_max_size;
1135 ci->i_requested_max_size = max_size;
1136 mtime = inode->i_mtime;
1137 atime = inode->i_atime;
1138 time_warp_seq = ci->i_time_warp_seq;
1139 follows = ci->i_snap_realm->cached_context->seq;
1140 uid = inode->i_uid;
1141 gid = inode->i_gid;
1142 mode = inode->i_mode;
1143
1144 if (dropping & CEPH_CAP_XATTR_EXCL) {
1145 __ceph_build_xattrs_blob(ci);
1146 xattr_version = ci->i_xattrs.version + 1;
1147 }
1148
1149 spin_unlock(&inode->i_lock);
1150
1151 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
1152 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
1153 size, max_size, &mtime, &atime, time_warp_seq,
1154 uid, gid, mode,
1155 xattr_version,
1156 (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
1157 follows);
1158 if (ret < 0) {
1159 dout("error sending cap msg, must requeue %p\n", inode);
1160 delayed = 1;
1161 }
1162
1163 if (wake)
1164 wake_up(&ci->i_cap_wq);
1165
1166 return delayed;
1167}
1168
1169/*
1170 * When a snapshot is taken, clients accumulate dirty metadata on
1171 * inodes with capabilities in ceph_cap_snaps to describe the file
1172 * state at the time the snapshot was taken. This must be flushed
1173 * asynchronously back to the MDS once sync writes complete and dirty
1174 * data is written out.
1175 *
1176 * Called under i_lock. Takes s_mutex as needed.
1177 */
1178void __ceph_flush_snaps(struct ceph_inode_info *ci,
1179 struct ceph_mds_session **psession)
1180{
1181 struct inode *inode = &ci->vfs_inode;
1182 int mds;
1183 struct ceph_cap_snap *capsnap;
1184 u32 mseq;
1185 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
1186 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
1187 session->s_mutex */
1188 u64 next_follows = 0; /* keep track of how far we've gotten through the
1189 i_cap_snaps list, and skip these entries next time
1190 around to avoid an infinite loop */
1191
1192 if (psession)
1193 session = *psession;
1194
1195 dout("__flush_snaps %p\n", inode);
1196retry:
1197 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
1198 /* avoid an infiniute loop after retry */
1199 if (capsnap->follows < next_follows)
1200 continue;
1201 /*
1202 * we need to wait for sync writes to complete and for dirty
1203 * pages to be written out.
1204 */
1205 if (capsnap->dirty_pages || capsnap->writing)
1206 continue;
1207
1208 /*
1209 * if cap writeback already occurred, we should have dropped
1210 * the capsnap in ceph_put_wrbuffer_cap_refs.
1211 */
1212 BUG_ON(capsnap->dirty == 0);
1213
1214 /* pick mds, take s_mutex */
1215 mds = __ceph_get_cap_mds(ci, &mseq);
1216 if (session && session->s_mds != mds) {
1217 dout("oops, wrong session %p mutex\n", session);
1218 mutex_unlock(&session->s_mutex);
1219 ceph_put_mds_session(session);
1220 session = NULL;
1221 }
1222 if (!session) {
1223 spin_unlock(&inode->i_lock);
1224 mutex_lock(&mdsc->mutex);
1225 session = __ceph_lookup_mds_session(mdsc, mds);
1226 mutex_unlock(&mdsc->mutex);
1227 if (session) {
1228 dout("inverting session/ino locks on %p\n",
1229 session);
1230 mutex_lock(&session->s_mutex);
1231 }
1232 /*
1233 * if session == NULL, we raced against a cap
1234 * deletion. retry, and we'll get a better
1235 * @mds value next time.
1236 */
1237 spin_lock(&inode->i_lock);
1238 goto retry;
1239 }
1240
1241 capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
1242 atomic_inc(&capsnap->nref);
1243 if (!list_empty(&capsnap->flushing_item))
1244 list_del_init(&capsnap->flushing_item);
1245 list_add_tail(&capsnap->flushing_item,
1246 &session->s_cap_snaps_flushing);
1247 spin_unlock(&inode->i_lock);
1248
1249 dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
1250 inode, capsnap, next_follows, capsnap->size);
1251 send_cap_msg(session, ceph_vino(inode).ino, 0,
1252 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
1253 capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
1254 capsnap->size, 0,
1255 &capsnap->mtime, &capsnap->atime,
1256 capsnap->time_warp_seq,
1257 capsnap->uid, capsnap->gid, capsnap->mode,
1258 0, NULL,
1259 capsnap->follows);
1260
1261 next_follows = capsnap->follows + 1;
1262 ceph_put_cap_snap(capsnap);
1263
1264 spin_lock(&inode->i_lock);
1265 goto retry;
1266 }
1267
1268 /* we flushed them all; remove this inode from the queue */
1269 spin_lock(&mdsc->snap_flush_lock);
1270 list_del_init(&ci->i_snap_flush_item);
1271 spin_unlock(&mdsc->snap_flush_lock);
1272
1273 if (psession)
1274 *psession = session;
1275 else if (session) {
1276 mutex_unlock(&session->s_mutex);
1277 ceph_put_mds_session(session);
1278 }
1279}
1280
1281static void ceph_flush_snaps(struct ceph_inode_info *ci)
1282{
1283 struct inode *inode = &ci->vfs_inode;
1284
1285 spin_lock(&inode->i_lock);
1286 __ceph_flush_snaps(ci, NULL);
1287 spin_unlock(&inode->i_lock);
1288}
1289
1290/*
1291 * Mark caps dirty. If inode is newly dirty, add to the global dirty
1292 * list.
1293 */
1294void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1295{
1296 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
1297 struct inode *inode = &ci->vfs_inode;
1298 int was = ci->i_dirty_caps;
1299 int dirty = 0;
1300
1301 dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
1302 ceph_cap_string(mask), ceph_cap_string(was),
1303 ceph_cap_string(was | mask));
1304 ci->i_dirty_caps |= mask;
1305 if (was == 0) {
1306 dout(" inode %p now dirty\n", &ci->vfs_inode);
1307 BUG_ON(!list_empty(&ci->i_dirty_item));
1308 spin_lock(&mdsc->cap_dirty_lock);
1309 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1310 spin_unlock(&mdsc->cap_dirty_lock);
1311 if (ci->i_flushing_caps == 0) {
1312 igrab(inode);
1313 dirty |= I_DIRTY_SYNC;
1314 }
1315 }
1316 BUG_ON(list_empty(&ci->i_dirty_item));
1317 if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
1318 (mask & CEPH_CAP_FILE_BUFFER))
1319 dirty |= I_DIRTY_DATASYNC;
1320 if (dirty)
1321 __mark_inode_dirty(inode, dirty);
1322 __cap_delay_requeue(mdsc, ci);
1323}
1324
1325/*
1326 * Add dirty inode to the flushing list. Assigned a seq number so we
1327 * can wait for caps to flush without starving.
1328 *
1329 * Called under i_lock.
1330 */
1331static int __mark_caps_flushing(struct inode *inode,
1332 struct ceph_mds_session *session)
1333{
1334 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1335 struct ceph_inode_info *ci = ceph_inode(inode);
1336 int flushing;
1337
1338 BUG_ON(ci->i_dirty_caps == 0);
1339 BUG_ON(list_empty(&ci->i_dirty_item));
1340
1341 flushing = ci->i_dirty_caps;
1342 dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
1343 ceph_cap_string(flushing),
1344 ceph_cap_string(ci->i_flushing_caps),
1345 ceph_cap_string(ci->i_flushing_caps | flushing));
1346 ci->i_flushing_caps |= flushing;
1347 ci->i_dirty_caps = 0;
1348 dout(" inode %p now !dirty\n", inode);
1349
1350 spin_lock(&mdsc->cap_dirty_lock);
1351 list_del_init(&ci->i_dirty_item);
1352
1353 ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
1354 if (list_empty(&ci->i_flushing_item)) {
1355 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1356 mdsc->num_cap_flushing++;
1357 dout(" inode %p now flushing seq %lld\n", inode,
1358 ci->i_cap_flush_seq);
1359 } else {
1360 list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1361 dout(" inode %p now flushing (more) seq %lld\n", inode,
1362 ci->i_cap_flush_seq);
1363 }
1364 spin_unlock(&mdsc->cap_dirty_lock);
1365
1366 return flushing;
1367}
1368
/*
 * Helper for the non-blocking invalidate path: returns 1 if no page is
 * found at index 0 of @mapping, 0 otherwise.
 *
 * NOTE(review): only index 0 is probed, so pages cached at other offsets
 * are not detected here -- confirm that is sufficient for callers.
 */
static int mapping_is_empty(struct address_space *mapping)
{
	struct page *first = find_get_page(mapping, 0);

	if (first) {
		/* drop the reference find_get_page() took */
		put_page(first);
		return 0;
	}
	return 1;
}
1382
1383static int try_nonblocking_invalidate(struct inode *inode)
1384{
1385 struct ceph_inode_info *ci = ceph_inode(inode);
1386 u32 invalidating_gen = ci->i_rdcache_gen;
1387
1388 spin_unlock(&inode->i_lock);
1389 invalidate_mapping_pages(&inode->i_data, 0, -1);
1390 spin_lock(&inode->i_lock);
1391
1392 if (mapping_is_empty(&inode->i_data) &&
1393 invalidating_gen == ci->i_rdcache_gen) {
1394 /* success. */
1395 dout("try_nonblocking_invalidate %p success\n", inode);
1396 ci->i_rdcache_gen = 0;
1397 ci->i_rdcache_revoking = 0;
1398 return 0;
1399 }
1400 dout("try_nonblocking_invalidate %p failed\n", inode);
1401 return -1;
1402}
1403
1404/*
1405 * Swiss army knife function to examine currently used and wanted
1406 * versus held caps. Release, flush, ack revoked caps to mds as
1407 * appropriate.
1408 *
1409 * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
1410 * cap release further.
1411 * CHECK_CAPS_AUTHONLY - we should only check the auth cap
1412 * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
1413 * further delay.
1414 */
1415void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1416 struct ceph_mds_session *session)
1417 __releases(session->s_mutex)
1418{
1419 struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
1420 struct ceph_mds_client *mdsc = &client->mdsc;
1421 struct inode *inode = &ci->vfs_inode;
1422 struct ceph_cap *cap;
1423 int file_wanted, used;
1424 int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
1425 int issued, implemented, want, retain, revoking, flushing = 0;
1426 int mds = -1; /* keep track of how far we've gone through i_caps list
1427 to avoid an infinite loop on retry */
1428 struct rb_node *p;
1429 int tried_invalidate = 0;
1430 int delayed = 0, sent = 0, force_requeue = 0, num;
1431 int queue_invalidate = 0;
1432 int is_delayed = flags & CHECK_CAPS_NODELAY;
1433
1434 /* if we are unmounting, flush any unused caps immediately. */
1435 if (mdsc->stopping)
1436 is_delayed = 1;
1437
1438 spin_lock(&inode->i_lock);
1439
1440 if (ci->i_ceph_flags & CEPH_I_FLUSH)
1441 flags |= CHECK_CAPS_FLUSH;
1442
1443 /* flush snaps first time around only */
1444 if (!list_empty(&ci->i_cap_snaps))
1445 __ceph_flush_snaps(ci, &session);
1446 goto retry_locked;
1447retry:
1448 spin_lock(&inode->i_lock);
1449retry_locked:
1450 file_wanted = __ceph_caps_file_wanted(ci);
1451 used = __ceph_caps_used(ci);
1452 want = file_wanted | used;
1453 issued = __ceph_caps_issued(ci, &implemented);
1454 revoking = implemented & ~issued;
1455
1456 retain = want | CEPH_CAP_PIN;
1457 if (!mdsc->stopping && inode->i_nlink > 0) {
1458 if (want) {
1459 retain |= CEPH_CAP_ANY; /* be greedy */
1460 } else {
1461 retain |= CEPH_CAP_ANY_SHARED;
1462 /*
1463 * keep RD only if we didn't have the file open RW,
1464 * because then the mds would revoke it anyway to
1465 * journal max_size=0.
1466 */
1467 if (ci->i_max_size == 0)
1468 retain |= CEPH_CAP_ANY_RD;
1469 }
1470 }
1471
1472 dout("check_caps %p file_want %s used %s dirty %s flushing %s"
1473 " issued %s revoking %s retain %s %s%s%s\n", inode,
1474 ceph_cap_string(file_wanted),
1475 ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
1476 ceph_cap_string(ci->i_flushing_caps),
1477 ceph_cap_string(issued), ceph_cap_string(revoking),
1478 ceph_cap_string(retain),
1479 (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
1480 (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
1481 (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
1482
1483 /*
1484 * If we no longer need to hold onto old our caps, and we may
1485 * have cached pages, but don't want them, then try to invalidate.
1486 * If we fail, it's because pages are locked.... try again later.
1487 */
1488 if ((!is_delayed || mdsc->stopping) &&
1489 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
1490 ci->i_rdcache_gen && /* may have cached pages */
1491 (file_wanted == 0 || /* no open files */
1492 (revoking & CEPH_CAP_FILE_CACHE)) && /* or revoking cache */
1493 !tried_invalidate) {
1494 dout("check_caps trying to invalidate on %p\n", inode);
1495 if (try_nonblocking_invalidate(inode) < 0) {
1496 if (revoking & CEPH_CAP_FILE_CACHE) {
1497 dout("check_caps queuing invalidate\n");
1498 queue_invalidate = 1;
1499 ci->i_rdcache_revoking = ci->i_rdcache_gen;
1500 } else {
1501 dout("check_caps failed to invalidate pages\n");
1502 /* we failed to invalidate pages. check these
1503 caps again later. */
1504 force_requeue = 1;
1505 __cap_set_timeouts(mdsc, ci);
1506 }
1507 }
1508 tried_invalidate = 1;
1509 goto retry_locked;
1510 }
1511
1512 num = 0;
1513 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
1514 cap = rb_entry(p, struct ceph_cap, ci_node);
1515 num++;
1516
1517 /* avoid looping forever */
1518 if (mds >= cap->mds ||
1519 ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
1520 continue;
1521
1522 /* NOTE: no side-effects allowed, until we take s_mutex */
1523
1524 revoking = cap->implemented & ~cap->issued;
1525 if (revoking)
1526 dout(" mds%d revoking %s\n", cap->mds,
1527 ceph_cap_string(revoking));
1528
1529 if (cap == ci->i_auth_cap &&
1530 (cap->issued & CEPH_CAP_FILE_WR)) {
1531 /* request larger max_size from MDS? */
1532 if (ci->i_wanted_max_size > ci->i_max_size &&
1533 ci->i_wanted_max_size > ci->i_requested_max_size) {
1534 dout("requesting new max_size\n");
1535 goto ack;
1536 }
1537
1538 /* approaching file_max? */
1539 if ((inode->i_size << 1) >= ci->i_max_size &&
1540 (ci->i_reported_size << 1) < ci->i_max_size) {
1541 dout("i_size approaching max_size\n");
1542 goto ack;
1543 }
1544 }
1545 /* flush anything dirty? */
1546 if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
1547 ci->i_dirty_caps) {
1548 dout("flushing dirty caps\n");
1549 goto ack;
1550 }
1551
1552 /* completed revocation? going down and there are no caps? */
1553 if (revoking && (revoking & used) == 0) {
1554 dout("completed revocation of %s\n",
1555 ceph_cap_string(cap->implemented & ~cap->issued));
1556 goto ack;
1557 }
1558
1559 /* want more caps from mds? */
1560 if (want & ~(cap->mds_wanted | cap->issued))
1561 goto ack;
1562
1563 /* things we might delay */
1564 if ((cap->issued & ~retain) == 0 &&
1565 cap->mds_wanted == want)
1566 continue; /* nope, all good */
1567
1568 if (is_delayed)
1569 goto ack;
1570
1571 /* delay? */
1572 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1573 time_before(jiffies, ci->i_hold_caps_max)) {
1574 dout(" delaying issued %s -> %s, wanted %s -> %s\n",
1575 ceph_cap_string(cap->issued),
1576 ceph_cap_string(cap->issued & retain),
1577 ceph_cap_string(cap->mds_wanted),
1578 ceph_cap_string(want));
1579 delayed++;
1580 continue;
1581 }
1582
1583ack:
1584 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1585 dout(" skipping %p I_NOFLUSH set\n", inode);
1586 continue;
1587 }
1588
1589 if (session && session != cap->session) {
1590 dout("oops, wrong session %p mutex\n", session);
1591 mutex_unlock(&session->s_mutex);
1592 session = NULL;
1593 }
1594 if (!session) {
1595 session = cap->session;
1596 if (mutex_trylock(&session->s_mutex) == 0) {
1597 dout("inverting session/ino locks on %p\n",
1598 session);
1599 spin_unlock(&inode->i_lock);
1600 if (took_snap_rwsem) {
1601 up_read(&mdsc->snap_rwsem);
1602 took_snap_rwsem = 0;
1603 }
1604 mutex_lock(&session->s_mutex);
1605 goto retry;
1606 }
1607 }
1608 /* take snap_rwsem after session mutex */
1609 if (!took_snap_rwsem) {
1610 if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
1611 dout("inverting snap/in locks on %p\n",
1612 inode);
1613 spin_unlock(&inode->i_lock);
1614 down_read(&mdsc->snap_rwsem);
1615 took_snap_rwsem = 1;
1616 goto retry;
1617 }
1618 took_snap_rwsem = 1;
1619 }
1620
1621 if (cap == ci->i_auth_cap && ci->i_dirty_caps)
1622 flushing = __mark_caps_flushing(inode, session);
1623
1624 mds = cap->mds; /* remember mds, so we don't repeat */
1625 sent++;
1626
1627 /* __send_cap drops i_lock */
1628 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
1629 retain, flushing, NULL);
1630 goto retry; /* retake i_lock and restart our cap scan. */
1631 }
1632
1633 /*
1634 * Reschedule delayed caps release if we delayed anything,
1635 * otherwise cancel.
1636 */
1637 if (delayed && is_delayed)
1638 force_requeue = 1; /* __send_cap delayed release; requeue */
1639 if (!delayed && !is_delayed)
1640 __cap_delay_cancel(mdsc, ci);
1641 else if (!is_delayed || force_requeue)
1642 __cap_delay_requeue(mdsc, ci);
1643
1644 spin_unlock(&inode->i_lock);
1645
1646 if (queue_invalidate)
1647 ceph_queue_invalidate(inode);
1648
1649 if (session)
1650 mutex_unlock(&session->s_mutex);
1651 if (took_snap_rwsem)
1652 up_read(&mdsc->snap_rwsem);
1653}
1654
1655/*
1656 * Try to flush dirty caps back to the auth mds.
1657 */
1658static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1659 unsigned *flush_tid)
1660{
1661 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1662 struct ceph_inode_info *ci = ceph_inode(inode);
1663 int unlock_session = session ? 0 : 1;
1664 int flushing = 0;
1665
1666retry:
1667 spin_lock(&inode->i_lock);
1668 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1669 dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
1670 goto out;
1671 }
1672 if (ci->i_dirty_caps && ci->i_auth_cap) {
1673 struct ceph_cap *cap = ci->i_auth_cap;
1674 int used = __ceph_caps_used(ci);
1675 int want = __ceph_caps_wanted(ci);
1676 int delayed;
1677
1678 if (!session) {
1679 spin_unlock(&inode->i_lock);
1680 session = cap->session;
1681 mutex_lock(&session->s_mutex);
1682 goto retry;
1683 }
1684 BUG_ON(session != cap->session);
1685 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
1686 goto out;
1687
1688 flushing = __mark_caps_flushing(inode, session);
1689
1690 /* __send_cap drops i_lock */
1691 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
1692 cap->issued | cap->implemented, flushing,
1693 flush_tid);
1694 if (!delayed)
1695 goto out_unlocked;
1696
1697 spin_lock(&inode->i_lock);
1698 __cap_delay_requeue(mdsc, ci);
1699 }
1700out:
1701 spin_unlock(&inode->i_lock);
1702out_unlocked:
1703 if (session && unlock_session)
1704 mutex_unlock(&session->s_mutex);
1705 return flushing;
1706}
1707
1708/*
1709 * Return true if we've flushed caps through the given flush_tid.
1710 */
1711static int caps_are_flushed(struct inode *inode, unsigned tid)
1712{
1713 struct ceph_inode_info *ci = ceph_inode(inode);
1714 int dirty, i, ret = 1;
1715
1716 spin_lock(&inode->i_lock);
1717 dirty = __ceph_caps_dirty(ci);
1718 for (i = 0; i < CEPH_CAP_BITS; i++)
1719 if ((ci->i_flushing_caps & (1 << i)) &&
1720 ci->i_cap_flush_tid[i] <= tid) {
1721 /* still flushing this bit */
1722 ret = 0;
1723 break;
1724 }
1725 spin_unlock(&inode->i_lock);
1726 return ret;
1727}
1728
1729/*
1730 * Wait on any unsafe replies for the given inode. First wait on the
1731 * newest request, and make that the upper bound. Then, if there are
1732 * more requests, keep waiting on the oldest as long as it is still older
1733 * than the original request.
1734 */
1735static void sync_write_wait(struct inode *inode)
1736{
1737 struct ceph_inode_info *ci = ceph_inode(inode);
1738 struct list_head *head = &ci->i_unsafe_writes;
1739 struct ceph_osd_request *req;
1740 u64 last_tid;
1741
1742 spin_lock(&ci->i_unsafe_lock);
1743 if (list_empty(head))
1744 goto out;
1745
1746 /* set upper bound as _last_ entry in chain */
1747 req = list_entry(head->prev, struct ceph_osd_request,
1748 r_unsafe_item);
1749 last_tid = req->r_tid;
1750
1751 do {
1752 ceph_osdc_get_request(req);
1753 spin_unlock(&ci->i_unsafe_lock);
1754 dout("sync_write_wait on tid %llu (until %llu)\n",
1755 req->r_tid, last_tid);
1756 wait_for_completion(&req->r_safe_completion);
1757 spin_lock(&ci->i_unsafe_lock);
1758 ceph_osdc_put_request(req);
1759
1760 /*
1761 * from here on look at first entry in chain, since we
1762 * only want to wait for anything older than last_tid
1763 */
1764 if (list_empty(head))
1765 break;
1766 req = list_entry(head->next, struct ceph_osd_request,
1767 r_unsafe_item);
1768 } while (req->r_tid < last_tid);
1769out:
1770 spin_unlock(&ci->i_unsafe_lock);
1771}
1772
1773int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
1774{
1775 struct inode *inode = dentry->d_inode;
1776 struct ceph_inode_info *ci = ceph_inode(inode);
1777 unsigned flush_tid;
1778 int ret;
1779 int dirty;
1780
1781 dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
1782 sync_write_wait(inode);
1783
1784 ret = filemap_write_and_wait(inode->i_mapping);
1785 if (ret < 0)
1786 return ret;
1787
1788 dirty = try_flush_caps(inode, NULL, &flush_tid);
1789 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
1790
1791 /*
1792 * only wait on non-file metadata writeback (the mds
1793 * can recover size and mtime, so we don't need to
1794 * wait for that)
1795 */
1796 if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
1797 dout("fsync waiting for flush_tid %u\n", flush_tid);
1798 ret = wait_event_interruptible(ci->i_cap_wq,
1799 caps_are_flushed(inode, flush_tid));
1800 }
1801
1802 dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
1803 return ret;
1804}
1805
1806/*
1807 * Flush any dirty caps back to the mds. If we aren't asked to wait,
1808 * queue inode for flush but don't do so immediately, because we can
1809 * get by with fewer MDS messages if we wait for data writeback to
1810 * complete first.
1811 */
1812int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1813{
1814 struct ceph_inode_info *ci = ceph_inode(inode);
1815 unsigned flush_tid;
1816 int err = 0;
1817 int dirty;
1818 int wait = wbc->sync_mode == WB_SYNC_ALL;
1819
1820 dout("write_inode %p wait=%d\n", inode, wait);
1821 if (wait) {
1822 dirty = try_flush_caps(inode, NULL, &flush_tid);
1823 if (dirty)
1824 err = wait_event_interruptible(ci->i_cap_wq,
1825 caps_are_flushed(inode, flush_tid));
1826 } else {
1827 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1828
1829 spin_lock(&inode->i_lock);
1830 if (__ceph_caps_dirty(ci))
1831 __cap_delay_requeue_front(mdsc, ci);
1832 spin_unlock(&inode->i_lock);
1833 }
1834 return err;
1835}
1836
1837/*
1838 * After a recovering MDS goes active, we need to resend any caps
1839 * we were flushing.
1840 *
1841 * Caller holds session->s_mutex.
1842 */
1843static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
1844 struct ceph_mds_session *session)
1845{
1846 struct ceph_cap_snap *capsnap;
1847
1848 dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
1849 list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
1850 flushing_item) {
1851 struct ceph_inode_info *ci = capsnap->ci;
1852 struct inode *inode = &ci->vfs_inode;
1853 struct ceph_cap *cap;
1854
1855 spin_lock(&inode->i_lock);
1856 cap = ci->i_auth_cap;
1857 if (cap && cap->session == session) {
1858 dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
1859 cap, capsnap);
1860 __ceph_flush_snaps(ci, &session);
1861 } else {
1862 pr_err("%p auth cap %p not mds%d ???\n", inode,
1863 cap, session->s_mds);
1864 spin_unlock(&inode->i_lock);
1865 }
1866 }
1867}
1868
1869void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1870 struct ceph_mds_session *session)
1871{
1872 struct ceph_inode_info *ci;
1873
1874 kick_flushing_capsnaps(mdsc, session);
1875
1876 dout("kick_flushing_caps mds%d\n", session->s_mds);
1877 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
1878 struct inode *inode = &ci->vfs_inode;
1879 struct ceph_cap *cap;
1880 int delayed = 0;
1881
1882 spin_lock(&inode->i_lock);
1883 cap = ci->i_auth_cap;
1884 if (cap && cap->session == session) {
1885 dout("kick_flushing_caps %p cap %p %s\n", inode,
1886 cap, ceph_cap_string(ci->i_flushing_caps));
1887 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1888 __ceph_caps_used(ci),
1889 __ceph_caps_wanted(ci),
1890 cap->issued | cap->implemented,
1891 ci->i_flushing_caps, NULL);
1892 if (delayed) {
1893 spin_lock(&inode->i_lock);
1894 __cap_delay_requeue(mdsc, ci);
1895 spin_unlock(&inode->i_lock);
1896 }
1897 } else {
1898 pr_err("%p auth cap %p not mds%d ???\n", inode,
1899 cap, session->s_mds);
1900 spin_unlock(&inode->i_lock);
1901 }
1902 }
1903}
1904
1905
1906/*
1907 * Take references to capabilities we hold, so that we don't release
1908 * them to the MDS prematurely.
1909 *
1910 * Protected by i_lock.
1911 */
1912static void __take_cap_refs(struct ceph_inode_info *ci, int got)
1913{
1914 if (got & CEPH_CAP_PIN)
1915 ci->i_pin_ref++;
1916 if (got & CEPH_CAP_FILE_RD)
1917 ci->i_rd_ref++;
1918 if (got & CEPH_CAP_FILE_CACHE)
1919 ci->i_rdcache_ref++;
1920 if (got & CEPH_CAP_FILE_WR)
1921 ci->i_wr_ref++;
1922 if (got & CEPH_CAP_FILE_BUFFER) {
1923 if (ci->i_wrbuffer_ref == 0)
1924 igrab(&ci->vfs_inode);
1925 ci->i_wrbuffer_ref++;
1926 dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
1927 &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
1928 }
1929}
1930
1931/*
1932 * Try to grab cap references. Specify those refs we @want, and the
1933 * minimal set we @need. Also include the larger offset we are writing
1934 * to (when applicable), and check against max_size here as well.
1935 * Note that caller is responsible for ensuring max_size increases are
1936 * requested from the MDS.
1937 */
1938static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
1939 int *got, loff_t endoff, int *check_max, int *err)
1940{
1941 struct inode *inode = &ci->vfs_inode;
1942 int ret = 0;
1943 int have, implemented;
1944 int file_wanted;
1945
1946 dout("get_cap_refs %p need %s want %s\n", inode,
1947 ceph_cap_string(need), ceph_cap_string(want));
1948 spin_lock(&inode->i_lock);
1949
1950 /* make sure file is actually open */
1951 file_wanted = __ceph_caps_file_wanted(ci);
1952 if ((file_wanted & need) == 0) {
1953 dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
1954 ceph_cap_string(need), ceph_cap_string(file_wanted));
1955 *err = -EBADF;
1956 ret = 1;
1957 goto out;
1958 }
1959
1960 if (need & CEPH_CAP_FILE_WR) {
1961 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
1962 dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
1963 inode, endoff, ci->i_max_size);
1964 if (endoff > ci->i_wanted_max_size) {
1965 *check_max = 1;
1966 ret = 1;
1967 }
1968 goto out;
1969 }
1970 /*
1971 * If a sync write is in progress, we must wait, so that we
1972 * can get a final snapshot value for size+mtime.
1973 */
1974 if (__ceph_have_pending_cap_snap(ci)) {
1975 dout("get_cap_refs %p cap_snap_pending\n", inode);
1976 goto out;
1977 }
1978 }
1979 have = __ceph_caps_issued(ci, &implemented);
1980
1981 /*
1982 * disallow writes while a truncate is pending
1983 */
1984 if (ci->i_truncate_pending)
1985 have &= ~CEPH_CAP_FILE_WR;
1986
1987 if ((have & need) == need) {
1988 /*
1989 * Look at (implemented & ~have & not) so that we keep waiting
1990 * on transition from wanted -> needed caps. This is needed
1991 * for WRBUFFER|WR -> WR to avoid a new WR sync write from
1992 * going before a prior buffered writeback happens.
1993 */
1994 int not = want & ~(have & need);
1995 int revoking = implemented & ~have;
1996 dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
1997 inode, ceph_cap_string(have), ceph_cap_string(not),
1998 ceph_cap_string(revoking));
1999 if ((revoking & not) == 0) {
2000 *got = need | (have & want);
2001 __take_cap_refs(ci, *got);
2002 ret = 1;
2003 }
2004 } else {
2005 dout("get_cap_refs %p have %s needed %s\n", inode,
2006 ceph_cap_string(have), ceph_cap_string(need));
2007 }
2008out:
2009 spin_unlock(&inode->i_lock);
2010 dout("get_cap_refs %p ret %d got %s\n", inode,
2011 ret, ceph_cap_string(*got));
2012 return ret;
2013}
2014
2015/*
2016 * Check the offset we are writing up to against our current
2017 * max_size. If necessary, tell the MDS we want to write to
2018 * a larger offset.
2019 */
2020static void check_max_size(struct inode *inode, loff_t endoff)
2021{
2022 struct ceph_inode_info *ci = ceph_inode(inode);
2023 int check = 0;
2024
2025 /* do we need to explicitly request a larger max_size? */
2026 spin_lock(&inode->i_lock);
2027 if ((endoff >= ci->i_max_size ||
2028 endoff > (inode->i_size << 1)) &&
2029 endoff > ci->i_wanted_max_size) {
2030 dout("write %p at large endoff %llu, req max_size\n",
2031 inode, endoff);
2032 ci->i_wanted_max_size = endoff;
2033 check = 1;
2034 }
2035 spin_unlock(&inode->i_lock);
2036 if (check)
2037 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2038}
2039
2040/*
2041 * Wait for caps, and take cap references. If we can't get a WR cap
2042 * due to a small max_size, make sure we check_max_size (and possibly
2043 * ask the mds) so we don't get hung up indefinitely.
2044 */
2045int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
2046 loff_t endoff)
2047{
2048 int check_max, ret, err;
2049
2050retry:
2051 if (endoff > 0)
2052 check_max_size(&ci->vfs_inode, endoff);
2053 check_max = 0;
2054 err = 0;
2055 ret = wait_event_interruptible(ci->i_cap_wq,
2056 try_get_cap_refs(ci, need, want,
2057 got, endoff,
2058 &check_max, &err));
2059 if (err)
2060 ret = err;
2061 if (check_max)
2062 goto retry;
2063 return ret;
2064}
2065
2066/*
2067 * Take cap refs. Caller must already know we hold at least one ref
2068 * on the caps in question or we don't know this is safe.
2069 */
2070void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
2071{
2072 spin_lock(&ci->vfs_inode.i_lock);
2073 __take_cap_refs(ci, caps);
2074 spin_unlock(&ci->vfs_inode.i_lock);
2075}
2076
2077/*
2078 * Release cap refs.
2079 *
2080 * If we released the last ref on any given cap, call ceph_check_caps
2081 * to release (or schedule a release).
2082 *
2083 * If we are releasing a WR cap (from a sync write), finalize any affected
2084 * cap_snap, and wake up any waiters.
2085 */
2086void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
2087{
2088 struct inode *inode = &ci->vfs_inode;
2089 int last = 0, put = 0, flushsnaps = 0, wake = 0;
2090 struct ceph_cap_snap *capsnap;
2091
2092 spin_lock(&inode->i_lock);
2093 if (had & CEPH_CAP_PIN)
2094 --ci->i_pin_ref;
2095 if (had & CEPH_CAP_FILE_RD)
2096 if (--ci->i_rd_ref == 0)
2097 last++;
2098 if (had & CEPH_CAP_FILE_CACHE)
2099 if (--ci->i_rdcache_ref == 0)
2100 last++;
2101 if (had & CEPH_CAP_FILE_BUFFER) {
2102 if (--ci->i_wrbuffer_ref == 0) {
2103 last++;
2104 put++;
2105 }
2106 dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
2107 inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
2108 }
2109 if (had & CEPH_CAP_FILE_WR)
2110 if (--ci->i_wr_ref == 0) {
2111 last++;
2112 if (!list_empty(&ci->i_cap_snaps)) {
2113 capsnap = list_first_entry(&ci->i_cap_snaps,
2114 struct ceph_cap_snap,
2115 ci_item);
2116 if (capsnap->writing) {
2117 capsnap->writing = 0;
2118 flushsnaps =
2119 __ceph_finish_cap_snap(ci,
2120 capsnap);
2121 wake = 1;
2122 }
2123 }
2124 }
2125 spin_unlock(&inode->i_lock);
2126
2127 dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
2128 last ? " last" : "", put ? " put" : "");
2129
2130 if (last && !flushsnaps)
2131 ceph_check_caps(ci, 0, NULL);
2132 else if (flushsnaps)
2133 ceph_flush_snaps(ci);
2134 if (wake)
2135 wake_up(&ci->i_cap_wq);
2136 if (put)
2137 iput(inode);
2138}
2139
2140/*
2141 * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
2142 * context. Adjust per-snap dirty page accounting as appropriate.
2143 * Once all dirty data for a cap_snap is flushed, flush snapped file
2144 * metadata back to the MDS. If we dropped the last ref, call
2145 * ceph_check_caps.
2146 */
2147void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2148 struct ceph_snap_context *snapc)
2149{
2150 struct inode *inode = &ci->vfs_inode;
2151 int last = 0;
2152 int complete_capsnap = 0;
2153 int drop_capsnap = 0;
2154 int found = 0;
2155 struct ceph_cap_snap *capsnap = NULL;
2156
2157 spin_lock(&inode->i_lock);
2158 ci->i_wrbuffer_ref -= nr;
2159 last = !ci->i_wrbuffer_ref;
2160
2161 if (ci->i_head_snapc == snapc) {
2162 ci->i_wrbuffer_ref_head -= nr;
2163 if (!ci->i_wrbuffer_ref_head) {
2164 ceph_put_snap_context(ci->i_head_snapc);
2165 ci->i_head_snapc = NULL;
2166 }
2167 dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
2168 inode,
2169 ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
2170 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
2171 last ? " LAST" : "");
2172 } else {
2173 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2174 if (capsnap->context == snapc) {
2175 found = 1;
2176 break;
2177 }
2178 }
2179 BUG_ON(!found);
2180 capsnap->dirty_pages -= nr;
2181 if (capsnap->dirty_pages == 0) {
2182 complete_capsnap = 1;
2183 if (capsnap->dirty == 0)
2184 /* cap writeback completed before we created
2185 * the cap_snap; no FLUSHSNAP is needed */
2186 drop_capsnap = 1;
2187 }
2188 dout("put_wrbuffer_cap_refs on %p cap_snap %p "
2189 " snap %lld %d/%d -> %d/%d %s%s%s\n",
2190 inode, capsnap, capsnap->context->seq,
2191 ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
2192 ci->i_wrbuffer_ref, capsnap->dirty_pages,
2193 last ? " (wrbuffer last)" : "",
2194 complete_capsnap ? " (complete capsnap)" : "",
2195 drop_capsnap ? " (drop capsnap)" : "");
2196 if (drop_capsnap) {
2197 ceph_put_snap_context(capsnap->context);
2198 list_del(&capsnap->ci_item);
2199 list_del(&capsnap->flushing_item);
2200 ceph_put_cap_snap(capsnap);
2201 }
2202 }
2203
2204 spin_unlock(&inode->i_lock);
2205
2206 if (last) {
2207 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2208 iput(inode);
2209 } else if (complete_capsnap) {
2210 ceph_flush_snaps(ci);
2211 wake_up(&ci->i_cap_wq);
2212 }
2213 if (drop_capsnap)
2214 iput(inode);
2215}
2216
2217/*
2218 * Handle a cap GRANT message from the MDS. (Note that a GRANT may
2219 * actually be a revocation if it specifies a smaller cap set.)
2220 *
2221 * caller holds s_mutex and i_lock, we drop both.
2222 *
2223 * return value:
2224 * 0 - ok
2225 * 1 - check_caps on auth cap only (writeback)
2226 * 2 - check_caps (ack revoke)
2227 */
2228static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2229 struct ceph_mds_session *session,
2230 struct ceph_cap *cap,
2231 struct ceph_buffer *xattr_buf)
2232 __releases(inode->i_lock)
2233 __releases(session->s_mutex)
2234{
2235 struct ceph_inode_info *ci = ceph_inode(inode);
2236 int mds = session->s_mds;
2237 int seq = le32_to_cpu(grant->seq);
2238 int newcaps = le32_to_cpu(grant->caps);
2239 int issued, implemented, used, wanted, dirty;
2240 u64 size = le64_to_cpu(grant->size);
2241 u64 max_size = le64_to_cpu(grant->max_size);
2242 struct timespec mtime, atime, ctime;
2243 int check_caps = 0;
2244 int wake = 0;
2245 int writeback = 0;
2246 int revoked_rdcache = 0;
2247 int queue_invalidate = 0;
2248
2249 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2250 inode, cap, mds, seq, ceph_cap_string(newcaps));
2251 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2252 inode->i_size);
2253
2254 /*
2255 * If CACHE is being revoked, and we have no dirty buffers,
2256 * try to invalidate (once). (If there are dirty buffers, we
2257 * will invalidate _after_ writeback.)
2258 */
2259 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
2260 !ci->i_wrbuffer_ref) {
2261 if (try_nonblocking_invalidate(inode) == 0) {
2262 revoked_rdcache = 1;
2263 } else {
2264 /* there were locked pages.. invalidate later
2265 in a separate thread. */
2266 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
2267 queue_invalidate = 1;
2268 ci->i_rdcache_revoking = ci->i_rdcache_gen;
2269 }
2270 }
2271 }
2272
2273 /* side effects now are allowed */
2274
2275 issued = __ceph_caps_issued(ci, &implemented);
2276 issued |= implemented | __ceph_caps_dirty(ci);
2277
2278 cap->cap_gen = session->s_cap_gen;
2279
2280 __check_cap_issue(ci, cap, newcaps);
2281
2282 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
2283 inode->i_mode = le32_to_cpu(grant->mode);
2284 inode->i_uid = le32_to_cpu(grant->uid);
2285 inode->i_gid = le32_to_cpu(grant->gid);
2286 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
2287 inode->i_uid, inode->i_gid);
2288 }
2289
2290 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
2291 inode->i_nlink = le32_to_cpu(grant->nlink);
2292
2293 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
2294 int len = le32_to_cpu(grant->xattr_len);
2295 u64 version = le64_to_cpu(grant->xattr_version);
2296
2297 if (version > ci->i_xattrs.version) {
2298 dout(" got new xattrs v%llu on %p len %d\n",
2299 version, inode, len);
2300 if (ci->i_xattrs.blob)
2301 ceph_buffer_put(ci->i_xattrs.blob);
2302 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
2303 ci->i_xattrs.version = version;
2304 }
2305 }
2306
2307 /* size/ctime/mtime/atime? */
2308 ceph_fill_file_size(inode, issued,
2309 le32_to_cpu(grant->truncate_seq),
2310 le64_to_cpu(grant->truncate_size), size);
2311 ceph_decode_timespec(&mtime, &grant->mtime);
2312 ceph_decode_timespec(&atime, &grant->atime);
2313 ceph_decode_timespec(&ctime, &grant->ctime);
2314 ceph_fill_file_time(inode, issued,
2315 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
2316 &atime);
2317
2318 /* max size increase? */
2319 if (max_size != ci->i_max_size) {
2320 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
2321 ci->i_max_size = max_size;
2322 if (max_size >= ci->i_wanted_max_size) {
2323 ci->i_wanted_max_size = 0; /* reset */
2324 ci->i_requested_max_size = 0;
2325 }
2326 wake = 1;
2327 }
2328
2329 /* check cap bits */
2330 wanted = __ceph_caps_wanted(ci);
2331 used = __ceph_caps_used(ci);
2332 dirty = __ceph_caps_dirty(ci);
2333 dout(" my wanted = %s, used = %s, dirty %s\n",
2334 ceph_cap_string(wanted),
2335 ceph_cap_string(used),
2336 ceph_cap_string(dirty));
2337 if (wanted != le32_to_cpu(grant->wanted)) {
2338 dout("mds wanted %s -> %s\n",
2339 ceph_cap_string(le32_to_cpu(grant->wanted)),
2340 ceph_cap_string(wanted));
2341 grant->wanted = cpu_to_le32(wanted);
2342 }
2343
2344 cap->seq = seq;
2345
2346 /* file layout may have changed */
2347 ci->i_layout = grant->layout;
2348
2349 /* revocation, grant, or no-op? */
2350 if (cap->issued & ~newcaps) {
2351 dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
2352 ceph_cap_string(newcaps));
2353 if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
2354 writeback = 1; /* will delay ack */
2355 else if (dirty & ~newcaps)
2356 check_caps = 1; /* initiate writeback in check_caps */
2357 else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
2358 revoked_rdcache)
2359 check_caps = 2; /* send revoke ack in check_caps */
2360 cap->issued = newcaps;
2361 cap->implemented |= newcaps;
2362 } else if (cap->issued == newcaps) {
2363 dout("caps unchanged: %s -> %s\n",
2364 ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
2365 } else {
2366 dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
2367 ceph_cap_string(newcaps));
2368 cap->issued = newcaps;
2369 cap->implemented |= newcaps; /* add bits only, to
2370 * avoid stepping on a
2371 * pending revocation */
2372 wake = 1;
2373 }
2374 BUG_ON(cap->issued & ~cap->implemented);
2375
2376 spin_unlock(&inode->i_lock);
2377 if (writeback)
2378 /*
2379 * queue inode for writeback: we can't actually call
2380 * filemap_write_and_wait, etc. from message handler
2381 * context.
2382 */
2383 ceph_queue_writeback(inode);
2384 if (queue_invalidate)
2385 ceph_queue_invalidate(inode);
2386 if (wake)
2387 wake_up(&ci->i_cap_wq);
2388
2389 if (check_caps == 1)
2390 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
2391 session);
2392 else if (check_caps == 2)
2393 ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
2394 else
2395 mutex_unlock(&session->s_mutex);
2396}
2397
2398/*
2399 * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
2400 * MDS has been safely committed.
2401 */
2402static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2403 struct ceph_mds_caps *m,
2404 struct ceph_mds_session *session,
2405 struct ceph_cap *cap)
2406 __releases(inode->i_lock)
2407{
2408 struct ceph_inode_info *ci = ceph_inode(inode);
2409 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
2410 unsigned seq = le32_to_cpu(m->seq);
2411 int dirty = le32_to_cpu(m->dirty);
2412 int cleaned = 0;
2413 int drop = 0;
2414 int i;
2415
2416 for (i = 0; i < CEPH_CAP_BITS; i++)
2417 if ((dirty & (1 << i)) &&
2418 flush_tid == ci->i_cap_flush_tid[i])
2419 cleaned |= 1 << i;
2420
2421 dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
2422 " flushing %s -> %s\n",
2423 inode, session->s_mds, seq, ceph_cap_string(dirty),
2424 ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
2425 ceph_cap_string(ci->i_flushing_caps & ~cleaned));
2426
2427 if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
2428 goto out;
2429
2430 ci->i_flushing_caps &= ~cleaned;
2431
2432 spin_lock(&mdsc->cap_dirty_lock);
2433 if (ci->i_flushing_caps == 0) {
2434 list_del_init(&ci->i_flushing_item);
2435 if (!list_empty(&session->s_cap_flushing))
2436 dout(" mds%d still flushing cap on %p\n",
2437 session->s_mds,
2438 &list_entry(session->s_cap_flushing.next,
2439 struct ceph_inode_info,
2440 i_flushing_item)->vfs_inode);
2441 mdsc->num_cap_flushing--;
2442 wake_up(&mdsc->cap_flushing_wq);
2443 dout(" inode %p now !flushing\n", inode);
2444
2445 if (ci->i_dirty_caps == 0) {
2446 dout(" inode %p now clean\n", inode);
2447 BUG_ON(!list_empty(&ci->i_dirty_item));
2448 drop = 1;
2449 } else {
2450 BUG_ON(list_empty(&ci->i_dirty_item));
2451 }
2452 }
2453 spin_unlock(&mdsc->cap_dirty_lock);
2454 wake_up(&ci->i_cap_wq);
2455
2456out:
2457 spin_unlock(&inode->i_lock);
2458 if (drop)
2459 iput(inode);
2460}
2461
2462/*
2463 * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
2464 * throw away our cap_snap.
2465 *
2466 * Caller hold s_mutex.
2467 */
2468static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
2469 struct ceph_mds_caps *m,
2470 struct ceph_mds_session *session)
2471{
2472 struct ceph_inode_info *ci = ceph_inode(inode);
2473 u64 follows = le64_to_cpu(m->snap_follows);
2474 struct ceph_cap_snap *capsnap;
2475 int drop = 0;
2476
2477 dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
2478 inode, ci, session->s_mds, follows);
2479
2480 spin_lock(&inode->i_lock);
2481 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2482 if (capsnap->follows == follows) {
2483 if (capsnap->flush_tid != flush_tid) {
2484 dout(" cap_snap %p follows %lld tid %lld !="
2485 " %lld\n", capsnap, follows,
2486 flush_tid, capsnap->flush_tid);
2487 break;
2488 }
2489 WARN_ON(capsnap->dirty_pages || capsnap->writing);
2490 dout(" removing %p cap_snap %p follows %lld\n",
2491 inode, capsnap, follows);
2492 ceph_put_snap_context(capsnap->context);
2493 list_del(&capsnap->ci_item);
2494 list_del(&capsnap->flushing_item);
2495 ceph_put_cap_snap(capsnap);
2496 drop = 1;
2497 break;
2498 } else {
2499 dout(" skipping cap_snap %p follows %lld\n",
2500 capsnap, capsnap->follows);
2501 }
2502 }
2503 spin_unlock(&inode->i_lock);
2504 if (drop)
2505 iput(inode);
2506}
2507
2508/*
2509 * Handle TRUNC from MDS, indicating file truncation.
2510 *
2511 * caller hold s_mutex.
2512 */
2513static void handle_cap_trunc(struct inode *inode,
2514 struct ceph_mds_caps *trunc,
2515 struct ceph_mds_session *session)
2516 __releases(inode->i_lock)
2517{
2518 struct ceph_inode_info *ci = ceph_inode(inode);
2519 int mds = session->s_mds;
2520 int seq = le32_to_cpu(trunc->seq);
2521 u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
2522 u64 truncate_size = le64_to_cpu(trunc->truncate_size);
2523 u64 size = le64_to_cpu(trunc->size);
2524 int implemented = 0;
2525 int dirty = __ceph_caps_dirty(ci);
2526 int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
2527 int queue_trunc = 0;
2528
2529 issued |= implemented | dirty;
2530
2531 dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
2532 inode, mds, seq, truncate_size, truncate_seq);
2533 queue_trunc = ceph_fill_file_size(inode, issued,
2534 truncate_seq, truncate_size, size);
2535 spin_unlock(&inode->i_lock);
2536
2537 if (queue_trunc)
2538 ceph_queue_vmtruncate(inode);
2539}
2540
2541/*
2542 * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
2543 * different one. If we are the most recent migration we've seen (as
2544 * indicated by mseq), make note of the migrating cap bits for the
2545 * duration (until we see the corresponding IMPORT).
2546 *
2547 * caller holds s_mutex
2548 */
2549static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2550 struct ceph_mds_session *session)
2551{
2552 struct ceph_inode_info *ci = ceph_inode(inode);
2553 int mds = session->s_mds;
2554 unsigned mseq = le32_to_cpu(ex->migrate_seq);
2555 struct ceph_cap *cap = NULL, *t;
2556 struct rb_node *p;
2557 int remember = 1;
2558
2559 dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
2560 inode, ci, mds, mseq);
2561
2562 spin_lock(&inode->i_lock);
2563
2564 /* make sure we haven't seen a higher mseq */
2565 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
2566 t = rb_entry(p, struct ceph_cap, ci_node);
2567 if (ceph_seq_cmp(t->mseq, mseq) > 0) {
2568 dout(" higher mseq on cap from mds%d\n",
2569 t->session->s_mds);
2570 remember = 0;
2571 }
2572 if (t->session->s_mds == mds)
2573 cap = t;
2574 }
2575
2576 if (cap) {
2577 if (remember) {
2578 /* make note */
2579 ci->i_cap_exporting_mds = mds;
2580 ci->i_cap_exporting_mseq = mseq;
2581 ci->i_cap_exporting_issued = cap->issued;
2582 }
2583 __ceph_remove_cap(cap);
2584 }
2585 /* else, we already released it */
2586
2587 spin_unlock(&inode->i_lock);
2588}
2589
2590/*
2591 * Handle cap IMPORT. If there are temp bits from an older EXPORT,
2592 * clean them up.
2593 *
2594 * caller holds s_mutex.
2595 */
2596static void handle_cap_import(struct ceph_mds_client *mdsc,
2597 struct inode *inode, struct ceph_mds_caps *im,
2598 struct ceph_mds_session *session,
2599 void *snaptrace, int snaptrace_len)
2600{
2601 struct ceph_inode_info *ci = ceph_inode(inode);
2602 int mds = session->s_mds;
2603 unsigned issued = le32_to_cpu(im->caps);
2604 unsigned wanted = le32_to_cpu(im->wanted);
2605 unsigned seq = le32_to_cpu(im->seq);
2606 unsigned mseq = le32_to_cpu(im->migrate_seq);
2607 u64 realmino = le64_to_cpu(im->realm);
2608 u64 cap_id = le64_to_cpu(im->cap_id);
2609
2610 if (ci->i_cap_exporting_mds >= 0 &&
2611 ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
2612 dout("handle_cap_import inode %p ci %p mds%d mseq %d"
2613 " - cleared exporting from mds%d\n",
2614 inode, ci, mds, mseq,
2615 ci->i_cap_exporting_mds);
2616 ci->i_cap_exporting_issued = 0;
2617 ci->i_cap_exporting_mseq = 0;
2618 ci->i_cap_exporting_mds = -1;
2619 } else {
2620 dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
2621 inode, ci, mds, mseq);
2622 }
2623
2624 down_write(&mdsc->snap_rwsem);
2625 ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
2626 false);
2627 downgrade_write(&mdsc->snap_rwsem);
2628 ceph_add_cap(inode, session, cap_id, -1,
2629 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
2630 NULL /* no caps context */);
2631 try_flush_caps(inode, session, NULL);
2632 up_read(&mdsc->snap_rwsem);
2633}
2634
2635/*
2636 * Handle a caps message from the MDS.
2637 *
2638 * Identify the appropriate session, inode, and call the right handler
2639 * based on the cap op.
2640 */
2641void ceph_handle_caps(struct ceph_mds_session *session,
2642 struct ceph_msg *msg)
2643{
2644 struct ceph_mds_client *mdsc = session->s_mdsc;
2645 struct super_block *sb = mdsc->client->sb;
2646 struct inode *inode;
2647 struct ceph_cap *cap;
2648 struct ceph_mds_caps *h;
2649 int mds = session->s_mds;
2650 int op;
2651 u32 seq;
2652 struct ceph_vino vino;
2653 u64 cap_id;
2654 u64 size, max_size;
2655 u64 tid;
2656 void *snaptrace;
2657
2658 dout("handle_caps from mds%d\n", mds);
2659
2660 /* decode */
2661 tid = le64_to_cpu(msg->hdr.tid);
2662 if (msg->front.iov_len < sizeof(*h))
2663 goto bad;
2664 h = msg->front.iov_base;
2665 snaptrace = h + 1;
2666 op = le32_to_cpu(h->op);
2667 vino.ino = le64_to_cpu(h->ino);
2668 vino.snap = CEPH_NOSNAP;
2669 cap_id = le64_to_cpu(h->cap_id);
2670 seq = le32_to_cpu(h->seq);
2671 size = le64_to_cpu(h->size);
2672 max_size = le64_to_cpu(h->max_size);
2673
2674 mutex_lock(&session->s_mutex);
2675 session->s_seq++;
2676 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
2677 (unsigned)seq);
2678
2679 /* lookup ino */
2680 inode = ceph_find_inode(sb, vino);
2681 dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
2682 vino.snap, inode);
2683 if (!inode) {
2684 dout(" i don't have ino %llx\n", vino.ino);
2685 goto done;
2686 }
2687
2688 /* these will work even if we don't have a cap yet */
2689 switch (op) {
2690 case CEPH_CAP_OP_FLUSHSNAP_ACK:
2691 handle_cap_flushsnap_ack(inode, tid, h, session);
2692 goto done;
2693
2694 case CEPH_CAP_OP_EXPORT:
2695 handle_cap_export(inode, h, session);
2696 goto done;
2697
2698 case CEPH_CAP_OP_IMPORT:
2699 handle_cap_import(mdsc, inode, h, session,
2700 snaptrace, le32_to_cpu(h->snap_trace_len));
2701 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
2702 session);
2703 goto done_unlocked;
2704 }
2705
2706 /* the rest require a cap */
2707 spin_lock(&inode->i_lock);
2708 cap = __get_cap_for_mds(ceph_inode(inode), mds);
2709 if (!cap) {
2710 dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
2711 inode, ceph_ino(inode), ceph_snap(inode), mds);
2712 spin_unlock(&inode->i_lock);
2713 goto done;
2714 }
2715
2716 /* note that each of these drops i_lock for us */
2717 switch (op) {
2718 case CEPH_CAP_OP_REVOKE:
2719 case CEPH_CAP_OP_GRANT:
2720 handle_cap_grant(inode, h, session, cap, msg->middle);
2721 goto done_unlocked;
2722
2723 case CEPH_CAP_OP_FLUSH_ACK:
2724 handle_cap_flush_ack(inode, tid, h, session, cap);
2725 break;
2726
2727 case CEPH_CAP_OP_TRUNC:
2728 handle_cap_trunc(inode, h, session);
2729 break;
2730
2731 default:
2732 spin_unlock(&inode->i_lock);
2733 pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
2734 ceph_cap_op_name(op));
2735 }
2736
2737done:
2738 mutex_unlock(&session->s_mutex);
2739done_unlocked:
2740 if (inode)
2741 iput(inode);
2742 return;
2743
2744bad:
2745 pr_err("ceph_handle_caps: corrupt message\n");
2746 ceph_msg_dump(msg);
2747 return;
2748}
2749
2750/*
2751 * Delayed work handler to process end of delayed cap release LRU list.
2752 */
2753void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
2754{
2755 struct ceph_inode_info *ci;
2756 int flags = CHECK_CAPS_NODELAY;
2757
2758 dout("check_delayed_caps\n");
2759 while (1) {
2760 spin_lock(&mdsc->cap_delay_lock);
2761 if (list_empty(&mdsc->cap_delay_list))
2762 break;
2763 ci = list_first_entry(&mdsc->cap_delay_list,
2764 struct ceph_inode_info,
2765 i_cap_delay_list);
2766 if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
2767 time_before(jiffies, ci->i_hold_caps_max))
2768 break;
2769 list_del_init(&ci->i_cap_delay_list);
2770 spin_unlock(&mdsc->cap_delay_lock);
2771 dout("check_delayed_caps on %p\n", &ci->vfs_inode);
2772 ceph_check_caps(ci, flags, NULL);
2773 }
2774 spin_unlock(&mdsc->cap_delay_lock);
2775}
2776
2777/*
2778 * Flush all dirty caps to the mds
2779 */
2780void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
2781{
2782 struct ceph_inode_info *ci, *nci = NULL;
2783 struct inode *inode, *ninode = NULL;
2784 struct list_head *p, *n;
2785
2786 dout("flush_dirty_caps\n");
2787 spin_lock(&mdsc->cap_dirty_lock);
2788 list_for_each_safe(p, n, &mdsc->cap_dirty) {
2789 if (nci) {
2790 ci = nci;
2791 inode = ninode;
2792 ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
2793 dout("flush_dirty_caps inode %p (was next inode)\n",
2794 inode);
2795 } else {
2796 ci = list_entry(p, struct ceph_inode_info,
2797 i_dirty_item);
2798 inode = igrab(&ci->vfs_inode);
2799 BUG_ON(!inode);
2800 dout("flush_dirty_caps inode %p\n", inode);
2801 }
2802 if (n != &mdsc->cap_dirty) {
2803 nci = list_entry(n, struct ceph_inode_info,
2804 i_dirty_item);
2805 ninode = igrab(&nci->vfs_inode);
2806 BUG_ON(!ninode);
2807 nci->i_ceph_flags |= CEPH_I_NOFLUSH;
2808 dout("flush_dirty_caps next inode %p, noflush\n",
2809 ninode);
2810 } else {
2811 nci = NULL;
2812 ninode = NULL;
2813 }
2814 spin_unlock(&mdsc->cap_dirty_lock);
2815 if (inode) {
2816 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
2817 NULL);
2818 iput(inode);
2819 }
2820 spin_lock(&mdsc->cap_dirty_lock);
2821 }
2822 spin_unlock(&mdsc->cap_dirty_lock);
2823}
2824
2825/*
2826 * Drop open file reference. If we were the last open file,
2827 * we may need to release capabilities to the MDS (or schedule
2828 * their delayed release).
2829 */
2830void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
2831{
2832 struct inode *inode = &ci->vfs_inode;
2833 int last = 0;
2834
2835 spin_lock(&inode->i_lock);
2836 dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
2837 ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
2838 BUG_ON(ci->i_nr_by_mode[fmode] == 0);
2839 if (--ci->i_nr_by_mode[fmode] == 0)
2840 last++;
2841 spin_unlock(&inode->i_lock);
2842
2843 if (last && ci->i_vino.snap == CEPH_NOSNAP)
2844 ceph_check_caps(ci, 0, NULL);
2845}
2846
2847/*
2848 * Helpers for embedding cap and dentry lease releases into mds
2849 * requests.
2850 *
2851 * @force is used by dentry_release (below) to force inclusion of a
2852 * record for the directory inode, even when there aren't any caps to
2853 * drop.
2854 */
2855int ceph_encode_inode_release(void **p, struct inode *inode,
2856 int mds, int drop, int unless, int force)
2857{
2858 struct ceph_inode_info *ci = ceph_inode(inode);
2859 struct ceph_cap *cap;
2860 struct ceph_mds_request_release *rel = *p;
2861 int ret = 0;
2862 int used = 0;
2863
2864 spin_lock(&inode->i_lock);
2865 used = __ceph_caps_used(ci);
2866
2867 dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode,
2868 mds, ceph_cap_string(used), ceph_cap_string(drop),
2869 ceph_cap_string(unless));
2870
2871 /* only drop unused caps */
2872 drop &= ~used;
2873
2874 cap = __get_cap_for_mds(ci, mds);
2875 if (cap && __cap_is_valid(cap)) {
2876 if (force ||
2877 ((cap->issued & drop) &&
2878 (cap->issued & unless) == 0)) {
2879 if ((cap->issued & drop) &&
2880 (cap->issued & unless) == 0) {
2881 dout("encode_inode_release %p cap %p %s -> "
2882 "%s\n", inode, cap,
2883 ceph_cap_string(cap->issued),
2884 ceph_cap_string(cap->issued & ~drop));
2885 cap->issued &= ~drop;
2886 cap->implemented &= ~drop;
2887 if (ci->i_ceph_flags & CEPH_I_NODELAY) {
2888 int wanted = __ceph_caps_wanted(ci);
2889 dout(" wanted %s -> %s (act %s)\n",
2890 ceph_cap_string(cap->mds_wanted),
2891 ceph_cap_string(cap->mds_wanted &
2892 ~wanted),
2893 ceph_cap_string(wanted));
2894 cap->mds_wanted &= wanted;
2895 }
2896 } else {
2897 dout("encode_inode_release %p cap %p %s"
2898 " (force)\n", inode, cap,
2899 ceph_cap_string(cap->issued));
2900 }
2901
2902 rel->ino = cpu_to_le64(ceph_ino(inode));
2903 rel->cap_id = cpu_to_le64(cap->cap_id);
2904 rel->seq = cpu_to_le32(cap->seq);
2905 rel->issue_seq = cpu_to_le32(cap->issue_seq),
2906 rel->mseq = cpu_to_le32(cap->mseq);
2907 rel->caps = cpu_to_le32(cap->issued);
2908 rel->wanted = cpu_to_le32(cap->mds_wanted);
2909 rel->dname_len = 0;
2910 rel->dname_seq = 0;
2911 *p += sizeof(*rel);
2912 ret = 1;
2913 } else {
2914 dout("encode_inode_release %p cap %p %s\n",
2915 inode, cap, ceph_cap_string(cap->issued));
2916 }
2917 }
2918 spin_unlock(&inode->i_lock);
2919 return ret;
2920}
2921
2922int ceph_encode_dentry_release(void **p, struct dentry *dentry,
2923 int mds, int drop, int unless)
2924{
2925 struct inode *dir = dentry->d_parent->d_inode;
2926 struct ceph_mds_request_release *rel = *p;
2927 struct ceph_dentry_info *di = ceph_dentry(dentry);
2928 int force = 0;
2929 int ret;
2930
2931 /*
2932 * force an record for the directory caps if we have a dentry lease.
2933 * this is racy (can't take i_lock and d_lock together), but it
2934 * doesn't have to be perfect; the mds will revoke anything we don't
2935 * release.
2936 */
2937 spin_lock(&dentry->d_lock);
2938 if (di->lease_session && di->lease_session->s_mds == mds)
2939 force = 1;
2940 spin_unlock(&dentry->d_lock);
2941
2942 ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
2943
2944 spin_lock(&dentry->d_lock);
2945 if (ret && di->lease_session && di->lease_session->s_mds == mds) {
2946 dout("encode_dentry_release %p mds%d seq %d\n",
2947 dentry, mds, (int)di->lease_seq);
2948 rel->dname_len = cpu_to_le32(dentry->d_name.len);
2949 memcpy(*p, dentry->d_name.name, dentry->d_name.len);
2950 *p += dentry->d_name.len;
2951 rel->dname_seq = cpu_to_le32(di->lease_seq);
2952 }
2953 spin_unlock(&dentry->d_lock);
2954 return ret;
2955}
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
new file mode 100644
index 000000000000..1818c2305610
--- /dev/null
+++ b/fs/ceph/ceph_debug.h
@@ -0,0 +1,37 @@
#ifndef _FS_CEPH_DEBUG_H
#define _FS_CEPH_DEBUG_H

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

/*
 * dout() is the ceph debug print macro.  It takes one of three forms:
 *
 *  - CONFIG_CEPH_FS_PRETTYDEBUG unset: a thin wrapper around pr_debug.
 *  - PRETTYDEBUG set and debugging compiled in (DEBUG or
 *    CONFIG_DYNAMIC_DEBUG): pr_debug with a filename:lineno prefix on
 *    each line.  This incurs some overhead (kernel size and execution
 *    time) due to the extra function call at each call site.
 *  - PRETTYDEBUG set but debugging not compiled in: a never-executed
 *    printk, kept only so the compiler still checks the arguments.
 */
#if !defined(CONFIG_CEPH_FS_PRETTYDEBUG)

# define dout(fmt, ...)	pr_debug(" " fmt, ##__VA_ARGS__)

#elif defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)

extern const char *ceph_file_part(const char *s, int len);
# define dout(fmt, ...)						\
	pr_debug(" %12.12s:%-4d : " fmt,				\
		 ceph_file_part(__FILE__, sizeof(__FILE__)),		\
		 __LINE__, ##__VA_ARGS__)

#else

/* faux printk call just to see any compiler warnings. */
# define dout(fmt, ...)	do {				\
		if (0)						\
			printk(KERN_DEBUG fmt, ##__VA_ARGS__);	\
	} while (0)

#endif

#endif
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
new file mode 100644
index 000000000000..ab6cf35c4091
--- /dev/null
+++ b/fs/ceph/ceph_frag.c
@@ -0,0 +1,21 @@
1/*
2 * Ceph 'frag' type
3 */
4#include "types.h"
5
6int ceph_frag_compare(__u32 a, __u32 b)
7{
8 unsigned va = ceph_frag_value(a);
9 unsigned vb = ceph_frag_value(b);
10 if (va < vb)
11 return -1;
12 if (va > vb)
13 return 1;
14 va = ceph_frag_bits(a);
15 vb = ceph_frag_bits(b);
16 if (va < vb)
17 return -1;
18 if (va > vb)
19 return 1;
20 return 0;
21}
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
new file mode 100644
index 000000000000..793f50cb7c22
--- /dev/null
+++ b/fs/ceph/ceph_frag.h
@@ -0,0 +1,109 @@
1#ifndef _FS_CEPH_FRAG_H
2#define _FS_CEPH_FRAG_H
3
4/*
5 * "Frags" are a way to describe a subset of a 32-bit number space,
6 * using a mask and a value to match against that mask. Any given frag
7 * (subset of the number space) can be partitioned into 2^n sub-frags.
8 *
9 * Frags are encoded into a 32-bit word:
10 * 8 upper bits = "bits"
11 * 24 lower bits = "value"
12 * (We could go to 5+27 bits, but who cares.)
13 *
14 * We use the _most_ significant bits of the 24 bit value. This makes
15 * values logically sort.
16 *
17 * Unfortunately, because the "bits" field is still in the high bits, we
18 * can't sort encoded frags numerically. However, it does allow you
19 * to feed encoded frags as values into frag_contains_value.
20 */
/*
 * Build an encoded frag: @b goes into the upper 8 bits, and @v is
 * truncated to its @b most significant bits within the 24-bit value.
 */
static inline __u32 ceph_frag_make(__u32 b, __u32 v)
{
	return (b << 24) |
		(v & (0xffffffu << (24-b)) & 0xffffffu);
}

/* Number of significant bits: the upper 8 bits of the encoded frag. */
static inline __u32 ceph_frag_bits(__u32 f)
{
	return f >> 24;
}

/* The lower 24 bits of the encoded frag. */
static inline __u32 ceph_frag_value(__u32 f)
{
	return f & 0xffffffu;
}

/* Mask covering the frag's significant bits within the 24-bit value. */
static inline __u32 ceph_frag_mask(__u32 f)
{
	return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
}

/* Count of low-order value bits that are not significant for @f. */
static inline __u32 ceph_frag_mask_shift(__u32 f)
{
	return 24 - ceph_frag_bits(f);
}

/* True if @v, reduced by @f's mask, equals @f's value. */
static inline int ceph_frag_contains_value(__u32 f, __u32 v)
{
	return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
}

static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
{
	/* is sub as specific as us, and contained by us? */
	return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
	       (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
}

/* The frag one bit shorter than @f, with @f's lowest significant bit cleared. */
static inline __u32 ceph_frag_parent(__u32 f)
{
	return ceph_frag_make(ceph_frag_bits(f) - 1,
			 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
}

/* True if @f has at least one bit and its lowest significant bit is 0. */
static inline int ceph_frag_is_left_child(__u32 f)
{
	return ceph_frag_bits(f) > 0 &&
		(ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
}

/*
 * True if @f has at least one bit and its lowest significant bit is 1.
 *
 * The masked bit sits at position (24 - bits), so it must be tested
 * for non-zero; comparing it with 1 only matches when bits == 24.
 */
static inline int ceph_frag_is_right_child(__u32 f)
{
	return ceph_frag_bits(f) > 0 &&
		(ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) != 0;
}

/* Same bit count as @f with the lowest significant bit flipped. */
static inline __u32 ceph_frag_sibling(__u32 f)
{
	return ceph_frag_make(ceph_frag_bits(f),
		      ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
}

/* One bit longer than @f; the new bit is 0. */
static inline __u32 ceph_frag_left_child(__u32 f)
{
	return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
}

/* One bit longer than @f; the new bit is 1. */
static inline __u32 ceph_frag_right_child(__u32 f)
{
	return ceph_frag_make(ceph_frag_bits(f)+1,
	      ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
}

/* @by bits longer than @f, with @i placed in the newly added bits. */
static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
{
	int newbits = ceph_frag_bits(f) + by;
	return ceph_frag_make(newbits,
			 ceph_frag_value(f) | (i << (24 - newbits)));
}

/* True if the value part of @f is zero. */
static inline int ceph_frag_is_leftmost(__u32 f)
{
	return ceph_frag_value(f) == 0;
}

/* True if every significant bit of @f's value is set. */
static inline int ceph_frag_is_rightmost(__u32 f)
{
	return ceph_frag_value(f) == ceph_frag_mask(f);
}

/* Same bit count as @f, value advanced by one lowest-significant-bit step. */
static inline __u32 ceph_frag_next(__u32 f)
{
	return ceph_frag_make(ceph_frag_bits(f),
			 ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
}
102
103/*
104 * comparator to sort frags logically, as when traversing the
105 * number space in ascending order...
106 */
107int ceph_frag_compare(__u32 a, __u32 b);
108
109#endif
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
new file mode 100644
index 000000000000..79d76bc4303f
--- /dev/null
+++ b/fs/ceph/ceph_fs.c
@@ -0,0 +1,74 @@
1/*
2 * Some non-inline ceph helpers
3 */
4#include "types.h"
5
6/*
7 * return true if @layout appears to be valid
8 */
9int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
10{
11 __u32 su = le32_to_cpu(layout->fl_stripe_unit);
12 __u32 sc = le32_to_cpu(layout->fl_stripe_count);
13 __u32 os = le32_to_cpu(layout->fl_object_size);
14
15 /* stripe unit, object size must be non-zero, 64k increment */
16 if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
17 return 0;
18 if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
19 return 0;
20 /* object size must be a multiple of stripe unit */
21 if (os < su || os % su)
22 return 0;
23 /* stripe count must be non-zero */
24 if (!sc)
25 return 0;
26 return 1;
27}
28
29
30int ceph_flags_to_mode(int flags)
31{
32#ifdef O_DIRECTORY /* fixme */
33 if ((flags & O_DIRECTORY) == O_DIRECTORY)
34 return CEPH_FILE_MODE_PIN;
35#endif
36#ifdef O_LAZY
37 if (flags & O_LAZY)
38 return CEPH_FILE_MODE_LAZY;
39#endif
40 if ((flags & O_APPEND) == O_APPEND)
41 flags |= O_WRONLY;
42
43 flags &= O_ACCMODE;
44 if ((flags & O_RDWR) == O_RDWR)
45 return CEPH_FILE_MODE_RDWR;
46 if ((flags & O_WRONLY) == O_WRONLY)
47 return CEPH_FILE_MODE_WR;
48 return CEPH_FILE_MODE_RD;
49}
50
51int ceph_caps_for_mode(int mode)
52{
53 switch (mode) {
54 case CEPH_FILE_MODE_PIN:
55 return CEPH_CAP_PIN;
56 case CEPH_FILE_MODE_RD:
57 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
58 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
59 case CEPH_FILE_MODE_RDWR:
60 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
61 CEPH_CAP_FILE_EXCL |
62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
63 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
64 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
65 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
66 case CEPH_FILE_MODE_WR:
67 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
68 CEPH_CAP_FILE_EXCL |
69 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
70 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
71 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
72 }
73 return 0;
74}
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
new file mode 100644
index 000000000000..0c2241ef3653
--- /dev/null
+++ b/fs/ceph/ceph_fs.h
@@ -0,0 +1,650 @@
1/*
2 * ceph_fs.h - Ceph constants and data types to share between kernel and
3 * user space.
4 *
5 * Most types in this file are defined as little-endian, and are
6 * primarily intended to describe data structures that pass over the
7 * wire or that are stored on disk.
8 *
9 * LGPL2
10 */
11
12#ifndef _FS_CEPH_CEPH_FS_H
13#define _FS_CEPH_CEPH_FS_H
14
15#include "msgr.h"
16#include "rados.h"
17
18/*
19 * Ceph release version
20 */
21#define CEPH_VERSION_MAJOR 0
22#define CEPH_VERSION_MINOR 19
23#define CEPH_VERSION_PATCH 0
24
25#define _CEPH_STRINGIFY(x) #x
26#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
27#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
28 "." CEPH_STRINGIFY(z)
29#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
30 CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
31
32/*
33 * subprotocol versions. when specific message types or high-level
34 * protocols change, bump the affected components. we rev the
35 * internal cluster protocols separately from the public,
36 * client-facing protocol.
37 */
38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
39#define CEPH_MDS_PROTOCOL 9 /* cluster internal */
40#define CEPH_MON_PROTOCOL 5 /* cluster internal */
41#define CEPH_OSDC_PROTOCOL 24 /* server/client */
42#define CEPH_MDSC_PROTOCOL 32 /* server/client */
43#define CEPH_MONC_PROTOCOL 15 /* server/client */
44
45
46#define CEPH_INO_ROOT 1
47#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
48
49/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
50#define CEPH_MAX_MON 31
51
52
53/*
54 * feature bits
55 */
56#define CEPH_FEATURE_SUPPORTED 0
57#define CEPH_FEATURE_REQUIRED 0
58
59
60/*
61 * ceph_file_layout - describe data layout for a file/inode
62 */
63struct ceph_file_layout {
64 /* file -> object mapping */
65 __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
66 of page size. */
67 __le32 fl_stripe_count; /* over this many objects */
68 __le32 fl_object_size; /* until objects are this big, then move to
69 new objects */
70 __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
71
72 /* pg -> disk layout */
73 __le32 fl_object_stripe_unit; /* for per-object parity, if any */
74
75 /* object -> pg layout */
76 __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
77 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
78} __attribute__ ((packed));
79
80#define CEPH_MIN_STRIPE_UNIT 65536
81
82int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
83
84
85/* crypto algorithms */
86#define CEPH_CRYPTO_NONE 0x0
87#define CEPH_CRYPTO_AES 0x1
88
89/* security/authentication protocols */
90#define CEPH_AUTH_UNKNOWN 0x0
91#define CEPH_AUTH_NONE 0x1
92#define CEPH_AUTH_CEPHX 0x2
93
94
95/*********************************************
96 * message layer
97 */
98
99/*
100 * message types
101 */
102
103/* misc */
104#define CEPH_MSG_SHUTDOWN 1
105#define CEPH_MSG_PING 2
106
107/* client <-> monitor */
108#define CEPH_MSG_MON_MAP 4
109#define CEPH_MSG_MON_GET_MAP 5
110#define CEPH_MSG_STATFS 13
111#define CEPH_MSG_STATFS_REPLY 14
112#define CEPH_MSG_MON_SUBSCRIBE 15
113#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
114#define CEPH_MSG_AUTH 17
115#define CEPH_MSG_AUTH_REPLY 18
116
117/* client <-> mds */
118#define CEPH_MSG_MDS_MAP 21
119
120#define CEPH_MSG_CLIENT_SESSION 22
121#define CEPH_MSG_CLIENT_RECONNECT 23
122
123#define CEPH_MSG_CLIENT_REQUEST 24
124#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
125#define CEPH_MSG_CLIENT_REPLY 26
126#define CEPH_MSG_CLIENT_CAPS 0x310
127#define CEPH_MSG_CLIENT_LEASE 0x311
128#define CEPH_MSG_CLIENT_SNAP 0x312
129#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
130
131/* osd */
132#define CEPH_MSG_OSD_MAP 41
133#define CEPH_MSG_OSD_OP 42
134#define CEPH_MSG_OSD_OPREPLY 43
135
136struct ceph_mon_request_header {
137 __le64 have_version;
138 __le16 session_mon;
139 __le64 session_mon_tid;
140} __attribute__ ((packed));
141
142struct ceph_mon_statfs {
143 struct ceph_mon_request_header monhdr;
144 struct ceph_fsid fsid;
145} __attribute__ ((packed));
146
147struct ceph_statfs {
148 __le64 kb, kb_used, kb_avail;
149 __le64 num_objects;
150} __attribute__ ((packed));
151
152struct ceph_mon_statfs_reply {
153 struct ceph_fsid fsid;
154 __le64 version;
155 struct ceph_statfs st;
156} __attribute__ ((packed));
157
158struct ceph_osd_getmap {
159 struct ceph_mon_request_header monhdr;
160 struct ceph_fsid fsid;
161 __le32 start;
162} __attribute__ ((packed));
163
164struct ceph_mds_getmap {
165 struct ceph_mon_request_header monhdr;
166 struct ceph_fsid fsid;
167} __attribute__ ((packed));
168
169struct ceph_client_mount {
170 struct ceph_mon_request_header monhdr;
171} __attribute__ ((packed));
172
173struct ceph_mon_subscribe_item {
174 __le64 have_version; __le64 have;
175 __u8 onetime;
176} __attribute__ ((packed));
177
178struct ceph_mon_subscribe_ack {
179 __le32 duration; /* seconds */
180 struct ceph_fsid fsid;
181} __attribute__ ((packed));
182
183/*
184 * mds states
185 * > 0 -> in
186 * <= 0 -> out
187 */
188#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
189#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
190 empty log. */
191#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
192#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
193#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
194#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
195#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
196
197#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
198#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
199 operations (import, rename, etc.) */
200#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
201#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
202#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
203#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
204#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
205
206extern const char *ceph_mds_state_name(int s);
207
208
209/*
210 * metadata lock types.
211 * - these are bitmasks.. we can compose them
212 * - they also define the lock ordering by the MDS
213 * - a few of these are internal to the mds
214 */
215#define CEPH_LOCK_DN 1
216#define CEPH_LOCK_ISNAP 2
217#define CEPH_LOCK_IVERSION 4 /* mds internal */
218#define CEPH_LOCK_IFILE 8 /* mds internal */
219#define CEPH_LOCK_IAUTH 32
220#define CEPH_LOCK_ILINK 64
221#define CEPH_LOCK_IDFT 128 /* dir frag tree */
222#define CEPH_LOCK_INEST 256 /* mds internal */
223#define CEPH_LOCK_IXATTR 512
224#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */
225
226/* client_session ops */
227enum {
228 CEPH_SESSION_REQUEST_OPEN,
229 CEPH_SESSION_OPEN,
230 CEPH_SESSION_REQUEST_CLOSE,
231 CEPH_SESSION_CLOSE,
232 CEPH_SESSION_REQUEST_RENEWCAPS,
233 CEPH_SESSION_RENEWCAPS,
234 CEPH_SESSION_STALE,
235 CEPH_SESSION_RECALL_STATE,
236};
237
238extern const char *ceph_session_op_name(int op);
239
240struct ceph_mds_session_head {
241 __le32 op;
242 __le64 seq;
243 struct ceph_timespec stamp;
244 __le32 max_caps, max_leases;
245} __attribute__ ((packed));
246
247/* client_request */
248/*
249 * metadata ops.
250 * & 0x001000 -> write op
251 * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
252 * & 0x100000 -> use weird ino/path trace
253 */
254#define CEPH_MDS_OP_WRITE 0x001000
255enum {
256 CEPH_MDS_OP_LOOKUP = 0x00100,
257 CEPH_MDS_OP_GETATTR = 0x00101,
258 CEPH_MDS_OP_LOOKUPHASH = 0x00102,
259 CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
260
261 CEPH_MDS_OP_SETXATTR = 0x01105,
262 CEPH_MDS_OP_RMXATTR = 0x01106,
263 CEPH_MDS_OP_SETLAYOUT = 0x01107,
264 CEPH_MDS_OP_SETATTR = 0x01108,
265
266 CEPH_MDS_OP_MKNOD = 0x01201,
267 CEPH_MDS_OP_LINK = 0x01202,
268 CEPH_MDS_OP_UNLINK = 0x01203,
269 CEPH_MDS_OP_RENAME = 0x01204,
270 CEPH_MDS_OP_MKDIR = 0x01220,
271 CEPH_MDS_OP_RMDIR = 0x01221,
272 CEPH_MDS_OP_SYMLINK = 0x01222,
273
274 CEPH_MDS_OP_CREATE = 0x01301,
275 CEPH_MDS_OP_OPEN = 0x00302,
276 CEPH_MDS_OP_READDIR = 0x00305,
277
278 CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
279 CEPH_MDS_OP_MKSNAP = 0x01400,
280 CEPH_MDS_OP_RMSNAP = 0x01401,
281 CEPH_MDS_OP_LSSNAP = 0x00402,
282};
283
284extern const char *ceph_mds_op_name(int op);
285
286
287#define CEPH_SETATTR_MODE 1
288#define CEPH_SETATTR_UID 2
289#define CEPH_SETATTR_GID 4
290#define CEPH_SETATTR_MTIME 8
291#define CEPH_SETATTR_ATIME 16
292#define CEPH_SETATTR_SIZE 32
293#define CEPH_SETATTR_CTIME 64
294
295union ceph_mds_request_args {
296 struct {
297 __le32 mask; /* CEPH_CAP_* */
298 } __attribute__ ((packed)) getattr;
299 struct {
300 __le32 mode;
301 __le32 uid;
302 __le32 gid;
303 struct ceph_timespec mtime;
304 struct ceph_timespec atime;
305 __le64 size, old_size; /* old_size needed by truncate */
306 __le32 mask; /* CEPH_SETATTR_* */
307 } __attribute__ ((packed)) setattr;
308 struct {
309 __le32 frag; /* which dir fragment */
310 __le32 max_entries; /* how many dentries to grab */
311 } __attribute__ ((packed)) readdir;
312 struct {
313 __le32 mode;
314 __le32 rdev;
315 } __attribute__ ((packed)) mknod;
316 struct {
317 __le32 mode;
318 } __attribute__ ((packed)) mkdir;
319 struct {
320 __le32 flags;
321 __le32 mode;
322 __le32 stripe_unit; /* layout for newly created file */
323 __le32 stripe_count; /* ... */
324 __le32 object_size;
325 __le32 file_replication;
326 __le32 preferred;
327 } __attribute__ ((packed)) open;
328 struct {
329 __le32 flags;
330 } __attribute__ ((packed)) setxattr;
331 struct {
332 struct ceph_file_layout layout;
333 } __attribute__ ((packed)) setlayout;
334} __attribute__ ((packed));
335
336#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
337#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
338
339struct ceph_mds_request_head {
340 __le64 oldest_client_tid;
341 __le32 mdsmap_epoch; /* on client */
342 __le32 flags; /* CEPH_MDS_FLAG_* */
343 __u8 num_retry, num_fwd; /* count retry, fwd attempts */
344 __le16 num_releases; /* # include cap/lease release records */
345 __le32 op; /* mds op code */
346 __le32 caller_uid, caller_gid;
347 __le64 ino; /* use this ino for openc, mkdir, mknod,
348 etc. (if replaying) */
349 union ceph_mds_request_args args;
350} __attribute__ ((packed));
351
352/* cap/lease release record */
353struct ceph_mds_request_release {
354 __le64 ino, cap_id; /* ino and unique cap id */
355 __le32 caps, wanted; /* new issued, wanted */
356 __le32 seq, issue_seq, mseq;
357 __le32 dname_seq; /* if releasing a dentry lease, a */
358 __le32 dname_len; /* string follows. */
359} __attribute__ ((packed));
360
361/* client reply */
362struct ceph_mds_reply_head {
363 __le32 op;
364 __le32 result;
365 __le32 mdsmap_epoch;
366 __u8 safe; /* true if committed to disk */
367 __u8 is_dentry, is_target; /* true if dentry, target inode records
368 are included with reply */
369} __attribute__ ((packed));
370
371/* one for each node split */
372struct ceph_frag_tree_split {
373 __le32 frag; /* this frag splits... */
374 __le32 by; /* ...by this many bits */
375} __attribute__ ((packed));
376
377struct ceph_frag_tree_head {
378 __le32 nsplits; /* num ceph_frag_tree_split records */
379 struct ceph_frag_tree_split splits[];
380} __attribute__ ((packed));
381
382/* capability issue, for bundling with mds reply */
383struct ceph_mds_reply_cap {
384 __le32 caps, wanted; /* caps issued, wanted */
385 __le64 cap_id;
386 __le32 seq, mseq;
387 __le64 realm; /* snap realm */
388 __u8 flags; /* CEPH_CAP_FLAG_* */
389} __attribute__ ((packed));
390
391#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
392
393/* inode record, for bundling with mds reply */
394struct ceph_mds_reply_inode {
395 __le64 ino;
396 __le64 snapid;
397 __le32 rdev;
398 __le64 version; /* inode version */
399 __le64 xattr_version; /* version for xattr blob */
400 struct ceph_mds_reply_cap cap; /* caps issued for this inode */
401 struct ceph_file_layout layout;
402 struct ceph_timespec ctime, mtime, atime;
403 __le32 time_warp_seq;
404 __le64 size, max_size, truncate_size;
405 __le32 truncate_seq;
406 __le32 mode, uid, gid;
407 __le32 nlink;
408 __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
409 struct ceph_timespec rctime;
410 struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
411} __attribute__ ((packed));
412/* followed by frag array, then symlink string, then xattr blob */
413
414/* reply_lease follows dname, and reply_inode */
415struct ceph_mds_reply_lease {
416 __le16 mask; /* lease type(s) */
417 __le32 duration_ms; /* lease duration */
418 __le32 seq;
419} __attribute__ ((packed));
420
421struct ceph_mds_reply_dirfrag {
422 __le32 frag; /* fragment */
423 __le32 auth; /* auth mds, if this is a delegation point */
424 __le32 ndist; /* number of mds' this is replicated on */
425 __le32 dist[];
426} __attribute__ ((packed));
427
428/* file access modes */
429#define CEPH_FILE_MODE_PIN 0
430#define CEPH_FILE_MODE_RD 1
431#define CEPH_FILE_MODE_WR 2
432#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
433#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
434#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */
435
436int ceph_flags_to_mode(int flags);
437
438
/* capability bits */
#define CEPH_CAP_PIN         1  /* no specific capabilities beyond the pin */

/*
 * generic cap bits.  these are shifted into place by one of the per-lock
 * shifts below to form the composed CEPH_CAP_<lock>_<bit> values.
 */
#define CEPH_CAP_GSHARED     1  /* client can read */
#define CEPH_CAP_GEXCL       2  /* client can read and update */
#define CEPH_CAP_GCACHE      4  /* (file) client can cache reads */
#define CEPH_CAP_GRD         8  /* (file) client can read */
#define CEPH_CAP_GWR        16  /* (file) client can write */
#define CEPH_CAP_GBUFFER    32  /* (file) client can buffer writes */
#define CEPH_CAP_GWREXTEND  64  /* (file) client can extend EOF */
#define CEPH_CAP_GLAZYIO   128  /* (file) client can perform lazy io */

/* per-lock shift */
#define CEPH_CAP_SAUTH      2
#define CEPH_CAP_SLINK      4
#define CEPH_CAP_SXATTR     6
#define CEPH_CAP_SFILE      8   /* goes at the end (uses >2 cap bits) */

#define CEPH_CAP_BITS       16

/* composed values */
#define CEPH_CAP_AUTH_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SAUTH)
#define CEPH_CAP_AUTH_EXCL    (CEPH_CAP_GEXCL    << CEPH_CAP_SAUTH)
#define CEPH_CAP_LINK_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SLINK)
#define CEPH_CAP_LINK_EXCL    (CEPH_CAP_GEXCL    << CEPH_CAP_SLINK)
#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED  << CEPH_CAP_SXATTR)
#define CEPH_CAP_XATTR_EXCL   (CEPH_CAP_GEXCL    << CEPH_CAP_SXATTR)
/*
 * shift generic cap bit(s) into the file lock's field.  the argument is
 * parenthesized so that a compound expression such as
 * CEPH_CAP_FILE(CEPH_CAP_GRD | CEPH_CAP_GWR) shifts every bit, not just
 * the last operand.
 */
#define CEPH_CAP_FILE(x)      ((x) << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_SHARED   (CEPH_CAP_GSHARED   << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_CACHE    (CEPH_CAP_GCACHE    << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_RD       (CEPH_CAP_GRD       << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_WR       (CEPH_CAP_GWR       << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_BUFFER   (CEPH_CAP_GBUFFER   << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
#define CEPH_CAP_FILE_LAZYIO   (CEPH_CAP_GLAZYIO   << CEPH_CAP_SFILE)
476
477/* cap masks (for getattr) */
478#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
479#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
480#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
481#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
482#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
483#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
484#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
485#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
486#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
487#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
488#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
489#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
490#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
491 CEPH_CAP_AUTH_SHARED | \
492 CEPH_CAP_LINK_SHARED | \
493 CEPH_CAP_FILE_SHARED | \
494 CEPH_CAP_XATTR_SHARED)
495
496#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
497 CEPH_CAP_LINK_SHARED | \
498 CEPH_CAP_XATTR_SHARED | \
499 CEPH_CAP_FILE_SHARED)
500#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
501 CEPH_CAP_FILE_CACHE)
502
503#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
504 CEPH_CAP_LINK_EXCL | \
505 CEPH_CAP_XATTR_EXCL | \
506 CEPH_CAP_FILE_EXCL)
507#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
508 CEPH_CAP_FILE_EXCL)
509#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
510#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
511 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN)
512
513#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
514 CEPH_LOCK_IXATTR)
515
516int ceph_caps_for_mode(int mode);
517
518enum {
519 CEPH_CAP_OP_GRANT, /* mds->client grant */
520 CEPH_CAP_OP_REVOKE, /* mds->client revoke */
521 CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
522 CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
523 CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
524 CEPH_CAP_OP_UPDATE, /* client->mds update */
525 CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
526 CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
527 CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
528 CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
529 CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
530 CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
531 CEPH_CAP_OP_RENEW, /* client->mds renewal request */
532};
533
534extern const char *ceph_cap_op_name(int op);
535
536/*
537 * caps message, used for capability callbacks, acks, requests, etc.
538 */
539struct ceph_mds_caps {
540 __le32 op; /* CEPH_CAP_OP_* */
541 __le64 ino, realm;
542 __le64 cap_id;
543 __le32 seq, issue_seq;
544 __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
545 __le32 migrate_seq;
546 __le64 snap_follows;
547 __le32 snap_trace_len;
548
549 /* authlock */
550 __le32 uid, gid, mode;
551
552 /* linklock */
553 __le32 nlink;
554
555 /* xattrlock */
556 __le32 xattr_len;
557 __le64 xattr_version;
558
559 /* filelock */
560 __le64 size, max_size, truncate_size;
561 __le32 truncate_seq;
562 struct ceph_timespec mtime, atime, ctime;
563 struct ceph_file_layout layout;
564 __le32 time_warp_seq;
565} __attribute__ ((packed));
566
567/* cap release msg head */
568struct ceph_mds_cap_release {
569 __le32 num; /* number of cap_items that follow */
570} __attribute__ ((packed));
571
572struct ceph_mds_cap_item {
573 __le64 ino;
574 __le64 cap_id;
575 __le32 migrate_seq, seq;
576} __attribute__ ((packed));
577
578#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
579#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
580#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
581#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
582
583extern const char *ceph_lease_op_name(int o);
584
585/* lease msg header */
586struct ceph_mds_lease {
587 __u8 action; /* CEPH_MDS_LEASE_* */
588 __le16 mask; /* which lease */
589 __le64 ino;
590 __le64 first, last; /* snap range */
591 __le32 seq;
592 __le32 duration_ms; /* duration of renewal */
593} __attribute__ ((packed));
594/* followed by a __le32+string for dname */
595
596/* client reconnect */
597struct ceph_mds_cap_reconnect {
598 __le64 cap_id;
599 __le32 wanted;
600 __le32 issued;
601 __le64 size;
602 struct ceph_timespec mtime, atime;
603 __le64 snaprealm;
604 __le64 pathbase; /* base ino for our path to this ino */
605} __attribute__ ((packed));
606/* followed by encoded string */
607
608struct ceph_mds_snaprealm_reconnect {
609 __le64 ino; /* snap realm base */
610 __le64 seq; /* snap seq for this snap realm */
611 __le64 parent; /* parent realm */
612} __attribute__ ((packed));
613
614/*
615 * snaps
616 */
617enum {
618 CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
619 CEPH_SNAP_OP_CREATE,
620 CEPH_SNAP_OP_DESTROY,
621 CEPH_SNAP_OP_SPLIT,
622};
623
624extern const char *ceph_snap_op_name(int o);
625
626/* snap msg header: fixed-size, packed head of a snap message */
627struct ceph_mds_snap_head {
628 __le32 op; /* CEPH_SNAP_OP_* */
629 __le64 split; /* ino to split off, if any */
630 __le32 num_split_inos; /* # inos belonging to new child realm */
631 __le32 num_split_realms; /* # child realms under new child realm */
632 __le32 trace_len; /* size of snap trace blob */
633} __attribute__ ((packed));
634/* followed by split ino list, then split realms, then the trace blob */
635
636/*
637 * encode info about a snaprealm, as viewed by a client
638 */
639struct ceph_mds_snap_realm {
640 __le64 ino; /* ino */
641 __le64 created; /* snap: when created */
642 __le64 parent; /* ino: parent realm */
643 __le64 parent_since; /* snap: same parent since */
644 __le64 seq; /* snap: version */
645 __le32 num_snaps;
646 __le32 num_prior_parent_snaps;
647} __attribute__ ((packed));
648/* followed by my snap list, then prior parent snap list */
649
650#endif
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
new file mode 100644
index 000000000000..bd570015d147
--- /dev/null
+++ b/fs/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
1
2#include "types.h"
3
4/*
5 * Robert Jenkins' hash function.
6 * http://burtleburtle.net/bob/hash/evahash.html
7 * This is in the public domain.
8 */
/*
 * Mix three 32-bit state words in place.  All three arguments are
 * modified, so each must be an lvalue.
 */
#define mix(a, b, c)						\
	do {							\
		a -= b;  a -= c;  a ^= (c >> 13);		\
		b -= c;  b -= a;  b ^= (a << 8);		\
		c -= a;  c -= b;  c ^= (b >> 13);		\
		a -= b;  a -= c;  a ^= (c >> 12);		\
		b -= c;  b -= a;  b ^= (a << 16);		\
		c -= a;  c -= b;  c ^= (b >> 5);		\
		a -= b;  a -= c;  a ^= (c >> 3);		\
		b -= c;  b -= a;  b ^= (a << 10);		\
		c -= a;  c -= b;  c ^= (b >> 15);		\
	} while (0)
21
22unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
23{
24 const unsigned char *k = (const unsigned char *)str;
25 __u32 a, b, c; /* the internal state */
26 __u32 len; /* how many key bytes still need mixing */
27
28 /* Set up the internal state */
29 len = length;
30 a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
31 b = a;
32 c = 0; /* variable initialization of internal state */
33
34 /* handle most of the key */
35 while (len >= 12) {
36 a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
37 ((__u32)k[3] << 24));
38 b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
39 ((__u32)k[7] << 24));
40 c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
41 ((__u32)k[11] << 24));
42 mix(a, b, c);
43 k = k + 12;
44 len = len - 12;
45 }
46
47 /* handle the last 11 bytes */
48 c = c + length;
49 switch (len) { /* all the case statements fall through */
50 case 11:
51 c = c + ((__u32)k[10] << 24);
52 case 10:
53 c = c + ((__u32)k[9] << 16);
54 case 9:
55 c = c + ((__u32)k[8] << 8);
56 /* the first byte of c is reserved for the length */
57 case 8:
58 b = b + ((__u32)k[7] << 24);
59 case 7:
60 b = b + ((__u32)k[6] << 16);
61 case 6:
62 b = b + ((__u32)k[5] << 8);
63 case 5:
64 b = b + k[4];
65 case 4:
66 a = a + ((__u32)k[3] << 24);
67 case 3:
68 a = a + ((__u32)k[2] << 16);
69 case 2:
70 a = a + ((__u32)k[1] << 8);
71 case 1:
72 a = a + k[0];
73 /* case 0: nothing left to add */
74 }
75 mix(a, b, c);
76
77 return c;
78}
79
/*
 * linux dcache hash
 *
 * Folds each byte of the first @length bytes of @str into a running
 * value; an empty input hashes to 0.
 */
unsigned ceph_str_hash_linux(const char *str, unsigned length)
{
	unsigned long acc = 0;
	unsigned i;

	for (i = 0; i < length; i++) {
		unsigned char ch = str[i];

		acc = (acc + (ch << 4) + (ch >> 4)) * 11;
	}
	return acc;
}
94
95
96unsigned ceph_str_hash(int type, const char *s, unsigned len)
97{
98 switch (type) {
99 case CEPH_STR_HASH_LINUX:
100 return ceph_str_hash_linux(s, len);
101 case CEPH_STR_HASH_RJENKINS:
102 return ceph_str_hash_rjenkins(s, len);
103 default:
104 return -1;
105 }
106}
107
108const char *ceph_str_hash_name(int type)
109{
110 switch (type) {
111 case CEPH_STR_HASH_LINUX:
112 return "linux";
113 case CEPH_STR_HASH_RJENKINS:
114 return "rjenkins";
115 default:
116 return "unknown";
117 }
118}
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
new file mode 100644
index 000000000000..5ac470c433c9
--- /dev/null
+++ b/fs/ceph/ceph_hash.h
@@ -0,0 +1,13 @@
1#ifndef _FS_CEPH_HASH_H
2#define _FS_CEPH_HASH_H
3
4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
6
7extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
8extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
9
10extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
11extern const char *ceph_str_hash_name(int type);
12
13#endif
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
new file mode 100644
index 000000000000..8e4be6a80c62
--- /dev/null
+++ b/fs/ceph/ceph_strings.c
@@ -0,0 +1,176 @@
1/*
2 * Ceph string constants
3 */
4#include "types.h"
5
6const char *ceph_entity_type_name(int type)
7{
8 switch (type) {
9 case CEPH_ENTITY_TYPE_MDS: return "mds";
10 case CEPH_ENTITY_TYPE_OSD: return "osd";
11 case CEPH_ENTITY_TYPE_MON: return "mon";
12 case CEPH_ENTITY_TYPE_CLIENT: return "client";
13 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown";
16 }
17}
18
19const char *ceph_osd_op_name(int op)
20{
21 switch (op) {
22 case CEPH_OSD_OP_READ: return "read";
23 case CEPH_OSD_OP_STAT: return "stat";
24
25 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
26
27 case CEPH_OSD_OP_WRITE: return "write";
28 case CEPH_OSD_OP_DELETE: return "delete";
29 case CEPH_OSD_OP_TRUNCATE: return "truncate";
30 case CEPH_OSD_OP_ZERO: return "zero";
31 case CEPH_OSD_OP_WRITEFULL: return "writefull";
32
33 case CEPH_OSD_OP_APPEND: return "append";
34 case CEPH_OSD_OP_STARTSYNC: return "startsync";
35 case CEPH_OSD_OP_SETTRUNC: return "settrunc";
36 case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
37
38 case CEPH_OSD_OP_TMAPUP: return "tmapup";
39 case CEPH_OSD_OP_TMAPGET: return "tmapget";
40 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
41
42 case CEPH_OSD_OP_GETXATTR: return "getxattr";
43 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
44 case CEPH_OSD_OP_SETXATTR: return "setxattr";
45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
47 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
48
49 case CEPH_OSD_OP_PULL: return "pull";
50 case CEPH_OSD_OP_PUSH: return "push";
51 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
52 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
53 case CEPH_OSD_OP_SCRUB: return "scrub";
54
55 case CEPH_OSD_OP_WRLOCK: return "wrlock";
56 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
57 case CEPH_OSD_OP_RDLOCK: return "rdlock";
58 case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
59 case CEPH_OSD_OP_UPLOCK: return "uplock";
60 case CEPH_OSD_OP_DNLOCK: return "dnlock";
61
62 case CEPH_OSD_OP_CALL: return "call";
63
64 case CEPH_OSD_OP_PGLS: return "pgls";
65 }
66 return "???";
67}
68
69const char *ceph_mds_state_name(int s)
70{
71 switch (s) {
72 /* down and out */
73 case CEPH_MDS_STATE_DNE: return "down:dne";
74 case CEPH_MDS_STATE_STOPPED: return "down:stopped";
75 /* up and out */
76 case CEPH_MDS_STATE_BOOT: return "up:boot";
77 case CEPH_MDS_STATE_STANDBY: return "up:standby";
78 case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
79 case CEPH_MDS_STATE_CREATING: return "up:creating";
80 case CEPH_MDS_STATE_STARTING: return "up:starting";
81 /* up and in */
82 case CEPH_MDS_STATE_REPLAY: return "up:replay";
83 case CEPH_MDS_STATE_RESOLVE: return "up:resolve";
84 case CEPH_MDS_STATE_RECONNECT: return "up:reconnect";
85 case CEPH_MDS_STATE_REJOIN: return "up:rejoin";
86 case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
87 case CEPH_MDS_STATE_ACTIVE: return "up:active";
88 case CEPH_MDS_STATE_STOPPING: return "up:stopping";
89 }
90 return "???";
91}
92
93const char *ceph_session_op_name(int op)
94{
95 switch (op) {
96 case CEPH_SESSION_REQUEST_OPEN: return "request_open";
97 case CEPH_SESSION_OPEN: return "open";
98 case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
99 case CEPH_SESSION_CLOSE: return "close";
100 case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
101 case CEPH_SESSION_RENEWCAPS: return "renewcaps";
102 case CEPH_SESSION_STALE: return "stale";
103 case CEPH_SESSION_RECALL_STATE: return "recall_state";
104 }
105 return "???";
106}
107
108const char *ceph_mds_op_name(int op)
109{
110 switch (op) {
111 case CEPH_MDS_OP_LOOKUP: return "lookup";
112 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
113 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
114 case CEPH_MDS_OP_GETATTR: return "getattr";
115 case CEPH_MDS_OP_SETXATTR: return "setxattr";
116 case CEPH_MDS_OP_SETATTR: return "setattr";
117 case CEPH_MDS_OP_RMXATTR: return "rmxattr";
118 case CEPH_MDS_OP_READDIR: return "readdir";
119 case CEPH_MDS_OP_MKNOD: return "mknod";
120 case CEPH_MDS_OP_LINK: return "link";
121 case CEPH_MDS_OP_UNLINK: return "unlink";
122 case CEPH_MDS_OP_RENAME: return "rename";
123 case CEPH_MDS_OP_MKDIR: return "mkdir";
124 case CEPH_MDS_OP_RMDIR: return "rmdir";
125 case CEPH_MDS_OP_SYMLINK: return "symlink";
126 case CEPH_MDS_OP_CREATE: return "create";
127 case CEPH_MDS_OP_OPEN: return "open";
128 case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
129 case CEPH_MDS_OP_LSSNAP: return "lssnap";
130 case CEPH_MDS_OP_MKSNAP: return "mksnap";
131 case CEPH_MDS_OP_RMSNAP: return "rmsnap";
132 }
133 return "???";
134}
135
136const char *ceph_cap_op_name(int op)
137{
138 switch (op) {
139 case CEPH_CAP_OP_GRANT: return "grant";
140 case CEPH_CAP_OP_REVOKE: return "revoke";
141 case CEPH_CAP_OP_TRUNC: return "trunc";
142 case CEPH_CAP_OP_EXPORT: return "export";
143 case CEPH_CAP_OP_IMPORT: return "import";
144 case CEPH_CAP_OP_UPDATE: return "update";
145 case CEPH_CAP_OP_DROP: return "drop";
146 case CEPH_CAP_OP_FLUSH: return "flush";
147 case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
148 case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
149 case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
150 case CEPH_CAP_OP_RELEASE: return "release";
151 case CEPH_CAP_OP_RENEW: return "renew";
152 }
153 return "???";
154}
155
156const char *ceph_lease_op_name(int o)
157{
158 switch (o) {
159 case CEPH_MDS_LEASE_REVOKE: return "revoke";
160 case CEPH_MDS_LEASE_RELEASE: return "release";
161 case CEPH_MDS_LEASE_RENEW: return "renew";
162 case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
163 }
164 return "???";
165}
166
167const char *ceph_snap_op_name(int o)
168{
169 switch (o) {
170 case CEPH_SNAP_OP_UPDATE: return "update";
171 case CEPH_SNAP_OP_CREATE: return "create";
172 case CEPH_SNAP_OP_DESTROY: return "destroy";
173 case CEPH_SNAP_OP_SPLIT: return "split";
174 }
175 return "???";
176}
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
new file mode 100644
index 000000000000..fabd302e5779
--- /dev/null
+++ b/fs/ceph/crush/crush.c
@@ -0,0 +1,151 @@
1
2#ifdef __KERNEL__
3# include <linux/slab.h>
4#else
5# include <stdlib.h>
6# include <assert.h>
7# define kfree(x) do { if (x) free(x); } while (0)
8# define BUG_ON(x) assert(!(x))
9#endif
10
11#include "crush.h"
12
13const char *crush_bucket_alg_name(int alg)
14{
15 switch (alg) {
16 case CRUSH_BUCKET_UNIFORM: return "uniform";
17 case CRUSH_BUCKET_LIST: return "list";
18 case CRUSH_BUCKET_TREE: return "tree";
19 case CRUSH_BUCKET_STRAW: return "straw";
20 default: return "unknown";
21 }
22}
23
24/**
25 * crush_get_bucket_item_weight - Get weight of an item in given bucket
26 * @b: bucket pointer
27 * @p: item index in bucket
28 */
29int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
30{
31 if (p >= b->size)
32 return 0;
33
34 switch (b->alg) {
35 case CRUSH_BUCKET_UNIFORM:
36 return ((struct crush_bucket_uniform *)b)->item_weight;
37 case CRUSH_BUCKET_LIST:
38 return ((struct crush_bucket_list *)b)->item_weights[p];
39 case CRUSH_BUCKET_TREE:
40 if (p & 1)
41 return ((struct crush_bucket_tree *)b)->node_weights[p];
42 return 0;
43 case CRUSH_BUCKET_STRAW:
44 return ((struct crush_bucket_straw *)b)->item_weights[p];
45 }
46 return 0;
47}
48
49/**
50 * crush_calc_parents - Calculate parent vectors for the given crush map.
51 * @map: crush_map pointer
52 */
53void crush_calc_parents(struct crush_map *map)
54{
55 int i, b, c;
56
57 for (b = 0; b < map->max_buckets; b++) {
58 if (map->buckets[b] == NULL)
59 continue;
60 for (i = 0; i < map->buckets[b]->size; i++) {
61 c = map->buckets[b]->items[i];
62 BUG_ON(c >= map->max_devices ||
63 c < -map->max_buckets);
64 if (c >= 0)
65 map->device_parents[c] = map->buckets[b]->id;
66 else
67 map->bucket_parents[-1-c] = map->buckets[b]->id;
68 }
69 }
70}
71
72void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
73{
74 kfree(b->h.perm);
75 kfree(b->h.items);
76 kfree(b);
77}
78
79void crush_destroy_bucket_list(struct crush_bucket_list *b)
80{
81 kfree(b->item_weights);
82 kfree(b->sum_weights);
83 kfree(b->h.perm);
84 kfree(b->h.items);
85 kfree(b);
86}
87
88void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
89{
90 kfree(b->node_weights);
91 kfree(b);
92}
93
94void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
95{
96 kfree(b->straws);
97 kfree(b->item_weights);
98 kfree(b->h.perm);
99 kfree(b->h.items);
100 kfree(b);
101}
102
103void crush_destroy_bucket(struct crush_bucket *b)
104{
105 switch (b->alg) {
106 case CRUSH_BUCKET_UNIFORM:
107 crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
108 break;
109 case CRUSH_BUCKET_LIST:
110 crush_destroy_bucket_list((struct crush_bucket_list *)b);
111 break;
112 case CRUSH_BUCKET_TREE:
113 crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
114 break;
115 case CRUSH_BUCKET_STRAW:
116 crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
117 break;
118 }
119}
120
121/**
122 * crush_destroy - Destroy a crush_map
123 * @map: crush_map pointer
124 */
125void crush_destroy(struct crush_map *map)
126{
127 int b;
128
129 /* buckets */
130 if (map->buckets) {
131 for (b = 0; b < map->max_buckets; b++) {
132 if (map->buckets[b] == NULL)
133 continue;
134 crush_destroy_bucket(map->buckets[b]);
135 }
136 kfree(map->buckets);
137 }
138
139 /* rules */
140 if (map->rules) {
141 for (b = 0; b < map->max_rules; b++)
142 kfree(map->rules[b]);
143 kfree(map->rules);
144 }
145
146 kfree(map->bucket_parents);
147 kfree(map->device_parents);
148 kfree(map);
149}
150
151
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
new file mode 100644
index 000000000000..dcd7e7523700
--- /dev/null
+++ b/fs/ceph/crush/crush.h
@@ -0,0 +1,180 @@
1#ifndef _CRUSH_CRUSH_H
2#define _CRUSH_CRUSH_H
3
4#include <linux/types.h>
5
6/*
7 * CRUSH is a pseudo-random data distribution algorithm that
8 * efficiently distributes input values (typically, data objects)
9 * across a heterogeneous, structured storage cluster.
10 *
11 * The algorithm was originally described in detail in this paper
12 * (although the algorithm has evolved somewhat since then):
13 *
14 * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
15 *
16 * LGPL2
17 */
18
19
20#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
21
22
23#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
24#define CRUSH_MAX_SET 10 /* max size of a mapping result */
25
26
27/*
28 * CRUSH uses user-defined "rules" to describe how inputs should be
29 * mapped to devices. A rule consists of sequence of steps to perform
30 * to generate the set of output devices.
31 */
32struct crush_rule_step {
33 __u32 op;
34 __s32 arg1;
35 __s32 arg2;
36};
37
/* step op codes (note that no op uses the value 5) */
enum {
	CRUSH_RULE_NOOP = 0,
	/* arg1 = value to start with */
	CRUSH_RULE_TAKE = 1,
	/* arg1 = num items to pick, arg2 = type */
	CRUSH_RULE_CHOOSE_FIRSTN = 2,
	/* same arguments as CHOOSE_FIRSTN */
	CRUSH_RULE_CHOOSE_INDEP = 3,
	/* no args */
	CRUSH_RULE_EMIT = 4,
	CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
	CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
};
49
50/*
51 * for specifying choose num (arg1) relative to the max parameter
52 * passed to do_rule
53 */
54#define CRUSH_CHOOSE_N 0
55#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
56
57/*
58 * The rule mask is used to describe what the rule is intended for.
59 * Given a ruleset and size of output set, we search through the
60 * rule list for a matching rule_mask.
61 */
62struct crush_rule_mask {
63 __u8 ruleset;
64 __u8 type;
65 __u8 min_size;
66 __u8 max_size;
67};
68
69struct crush_rule {
70 __u32 len;
71 struct crush_rule_mask mask;
72 struct crush_rule_step steps[0];
73};
74
75#define crush_rule_size(len) (sizeof(struct crush_rule) + \
76 (len)*sizeof(struct crush_rule_step))
77
78
79
80/*
81 * A bucket is a named container of other items (either devices or
82 * other buckets). Items within a bucket are chosen using one of a
83 * few different algorithms. The table summarizes how the speed of
84 * each option measures up against mapping stability when items are
85 * added or removed.
86 *
87 * Bucket Alg Speed Additions Removals
88 * ------------------------------------------------
89 * uniform O(1) poor poor
90 * list O(n) optimal poor
91 * tree O(log n) good good
92 * straw O(n) optimal optimal
93 */
/* bucket algorithm ids, stored in crush_bucket.alg */
enum {
	CRUSH_BUCKET_UNIFORM	= 1,
	CRUSH_BUCKET_LIST	= 2,
	CRUSH_BUCKET_TREE	= 3,
	CRUSH_BUCKET_STRAW	= 4
};
100extern const char *crush_bucket_alg_name(int alg);
101
102struct crush_bucket {
103 __s32 id; /* this'll be negative */
104 __u16 type; /* non-zero; type=0 is reserved for devices */
105 __u8 alg; /* one of CRUSH_BUCKET_* */
106 __u8 hash; /* which hash function to use, CRUSH_HASH_* */
107 __u32 weight; /* 16-bit fixed point */
108 __u32 size; /* num items */
109 __s32 *items;
110
111 /*
112 * cached random permutation: used for uniform bucket and for
113 * the linear search fallback for the other bucket types.
114 */
115 __u32 perm_x; /* @x for which *perm is defined */
116 __u32 perm_n; /* num elements of *perm that are permuted/defined */
117 __u32 *perm;
118};
119
120struct crush_bucket_uniform {
121 struct crush_bucket h;
122 __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
123};
124
125struct crush_bucket_list {
126 struct crush_bucket h;
127 __u32 *item_weights; /* 16-bit fixed point */
128 __u32 *sum_weights; /* 16-bit fixed point. element i is sum
129 of weights 0..i, inclusive */
130};
131
132struct crush_bucket_tree {
133 struct crush_bucket h; /* note: h.size is _tree_ size, not number of
134 actual items */
135 __u8 num_nodes;
136 __u32 *node_weights;
137};
138
139struct crush_bucket_straw {
140 struct crush_bucket h;
141 __u32 *item_weights; /* 16-bit fixed point */
142 __u32 *straws; /* 16-bit fixed point */
143};
144
145
146
147/*
148 * CRUSH map includes all buckets, rules, etc.
149 */
150struct crush_map {
151 struct crush_bucket **buckets;
152 struct crush_rule **rules;
153
154 /*
155 * Parent pointers to identify the parent bucket a device or
156 * bucket in the hierarchy. If an item appears more than
157 * once, this is the _last_ time it appeared (where buckets
158 * are processed in bucket id order, from -1 on down to
159 * -max_buckets.
160 */
161 __u32 *bucket_parents;
162 __u32 *device_parents;
163
164 __s32 max_buckets;
165 __u32 max_rules;
166 __s32 max_devices;
167};
168
169
/* functions implemented in crush.c */
int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
void crush_calc_parents(struct crush_map *map);
void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
void crush_destroy_bucket_list(struct crush_bucket_list *b);
void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
void crush_destroy_bucket(struct crush_bucket *b);
void crush_destroy(struct crush_map *map);
179
180#endif
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
new file mode 100644
index 000000000000..5873aed694bf
--- /dev/null
+++ b/fs/ceph/crush/hash.c
@@ -0,0 +1,149 @@
1
2#include <linux/types.h>
3#include "hash.h"
4
/*
 * Robert Jenkins' function for mixing 32-bit values
 * http://burtleburtle.net/bob/hash/evahash.html
 * a, b = random bits, c = input and output
 *
 * All three arguments are modified in place, so each must be a plain
 * variable.
 */
#define crush_hashmix(a, b, c) do {			\
		a -= b; a -= c; a ^= (c >> 13);		\
		b -= c; b -= a; b ^= (a << 8);		\
		c -= a; c -= b; c ^= (b >> 13);		\
		a -= b; a -= c; a ^= (c >> 12);		\
		b -= c; b -= a; b ^= (a << 16);		\
		c -= a; c -= b; c ^= (b >> 5);		\
		a -= b; a -= c; a ^= (c >> 3);		\
		b -= c; b -= a; b ^= (a << 10);		\
		c -= a; c -= b; c ^= (b >> 15);		\
	} while (0)
21
22#define crush_hash_seed 1315423911
23
24static __u32 crush_hash32_rjenkins1(__u32 a)
25{
26 __u32 hash = crush_hash_seed ^ a;
27 __u32 b = a;
28 __u32 x = 231232;
29 __u32 y = 1232;
30 crush_hashmix(b, x, hash);
31 crush_hashmix(y, a, hash);
32 return hash;
33}
34
35static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
36{
37 __u32 hash = crush_hash_seed ^ a ^ b;
38 __u32 x = 231232;
39 __u32 y = 1232;
40 crush_hashmix(a, b, hash);
41 crush_hashmix(x, a, hash);
42 crush_hashmix(b, y, hash);
43 return hash;
44}
45
46static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
47{
48 __u32 hash = crush_hash_seed ^ a ^ b ^ c;
49 __u32 x = 231232;
50 __u32 y = 1232;
51 crush_hashmix(a, b, hash);
52 crush_hashmix(c, x, hash);
53 crush_hashmix(y, a, hash);
54 crush_hashmix(b, x, hash);
55 crush_hashmix(y, c, hash);
56 return hash;
57}
58
59static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
60{
61 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
62 __u32 x = 231232;
63 __u32 y = 1232;
64 crush_hashmix(a, b, hash);
65 crush_hashmix(c, d, hash);
66 crush_hashmix(a, x, hash);
67 crush_hashmix(y, b, hash);
68 crush_hashmix(c, x, hash);
69 crush_hashmix(y, d, hash);
70 return hash;
71}
72
73static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
74 __u32 e)
75{
76 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
77 __u32 x = 231232;
78 __u32 y = 1232;
79 crush_hashmix(a, b, hash);
80 crush_hashmix(c, d, hash);
81 crush_hashmix(e, x, hash);
82 crush_hashmix(y, a, hash);
83 crush_hashmix(b, x, hash);
84 crush_hashmix(y, c, hash);
85 crush_hashmix(d, x, hash);
86 crush_hashmix(y, e, hash);
87 return hash;
88}
89
90
91__u32 crush_hash32(int type, __u32 a)
92{
93 switch (type) {
94 case CRUSH_HASH_RJENKINS1:
95 return crush_hash32_rjenkins1(a);
96 default:
97 return 0;
98 }
99}
100
101__u32 crush_hash32_2(int type, __u32 a, __u32 b)
102{
103 switch (type) {
104 case CRUSH_HASH_RJENKINS1:
105 return crush_hash32_rjenkins1_2(a, b);
106 default:
107 return 0;
108 }
109}
110
111__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
112{
113 switch (type) {
114 case CRUSH_HASH_RJENKINS1:
115 return crush_hash32_rjenkins1_3(a, b, c);
116 default:
117 return 0;
118 }
119}
120
121__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
122{
123 switch (type) {
124 case CRUSH_HASH_RJENKINS1:
125 return crush_hash32_rjenkins1_4(a, b, c, d);
126 default:
127 return 0;
128 }
129}
130
131__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
132{
133 switch (type) {
134 case CRUSH_HASH_RJENKINS1:
135 return crush_hash32_rjenkins1_5(a, b, c, d, e);
136 default:
137 return 0;
138 }
139}
140
141const char *crush_hash_name(int type)
142{
143 switch (type) {
144 case CRUSH_HASH_RJENKINS1:
145 return "rjenkins1";
146 default:
147 return "unknown";
148 }
149}
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
new file mode 100644
index 000000000000..ff48e110e4bb
--- /dev/null
+++ b/fs/ceph/crush/hash.h
@@ -0,0 +1,17 @@
1#ifndef _CRUSH_HASH_H
2#define _CRUSH_HASH_H
3
4#define CRUSH_HASH_RJENKINS1 0
5
6#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
7
8extern const char *crush_hash_name(int type);
9
10extern __u32 crush_hash32(int type, __u32 a);
11extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
12extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
13extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
14extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
15 __u32 e);
16
17#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
new file mode 100644
index 000000000000..9ba54efb6543
--- /dev/null
+++ b/fs/ceph/crush/mapper.c
@@ -0,0 +1,596 @@
1
2#ifdef __KERNEL__
3# include <linux/string.h>
4# include <linux/slab.h>
5# include <linux/bug.h>
6# include <linux/kernel.h>
7# ifndef dprintk
8# define dprintk(args...)
9# endif
10#else
11# include <string.h>
12# include <stdio.h>
13# include <stdlib.h>
14# include <assert.h>
15# define BUG_ON(x) assert(!(x))
16# define dprintk(args...) /* printf(args) */
17# define kmalloc(x, f) malloc(x)
18# define kfree(x) free(x)
19#endif
20
21#include "crush.h"
22#include "hash.h"
23
24/*
25 * Implement the core CRUSH mapping algorithm.
26 */
27
28/**
29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
30 * @map: the crush_map
31 * @ruleset: the storage ruleset id (user defined)
32 * @type: storage ruleset type (user defined)
33 * @size: output set size
34 */
35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
36{
37 int i;
38
39 for (i = 0; i < map->max_rules; i++) {
40 if (map->rules[i] &&
41 map->rules[i]->mask.ruleset == ruleset &&
42 map->rules[i]->mask.type == type &&
43 map->rules[i]->mask.min_size <= size &&
44 map->rules[i]->mask.max_size >= size)
45 return i;
46 }
47 return -1;
48}
49
50
51/*
52 * bucket choose methods
53 *
54 * For each bucket algorithm, we have a "choose" method that, given a
55 * crush input @x and replica position (usually, position in output set) @r,
56 * will produce an item in the bucket.
57 */
58
59/*
60 * Choose based on a random permutation of the bucket.
61 *
62 * We used to use some prime number arithmetic to do this, but it
63 * wasn't very random, and had some other bad behaviors. Instead, we
64 * calculate an actual random permutation of the bucket members.
65 * Since this is expensive, we optimize for the r=0 case, which
66 * captures the vast majority of calls.
67 */
68static int bucket_perm_choose(struct crush_bucket *bucket,
69 int x, int r)
70{
71 unsigned pr = r % bucket->size;
72 unsigned i, s;
73
74 /* start a new permutation if @x has changed */
75 if (bucket->perm_x != x || bucket->perm_n == 0) {
76 dprintk("bucket %d new x=%d\n", bucket->id, x);
77 bucket->perm_x = x;
78
79 /* optimize common r=0 case */
80 if (pr == 0) {
81 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
82 bucket->size;
83 bucket->perm[0] = s;
84 bucket->perm_n = 0xffff; /* magic value, see below */
85 goto out;
86 }
87
88 for (i = 0; i < bucket->size; i++)
89 bucket->perm[i] = i;
90 bucket->perm_n = 0;
91 } else if (bucket->perm_n == 0xffff) {
92 /* clean up after the r=0 case above */
93 for (i = 1; i < bucket->size; i++)
94 bucket->perm[i] = i;
95 bucket->perm[bucket->perm[0]] = 0;
96 bucket->perm_n = 1;
97 }
98
99 /* calculate permutation up to pr */
100 for (i = 0; i < bucket->perm_n; i++)
101 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
102 while (bucket->perm_n <= pr) {
103 unsigned p = bucket->perm_n;
104 /* no point in swapping the final entry */
105 if (p < bucket->size - 1) {
106 i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
107 (bucket->size - p);
108 if (i) {
109 unsigned t = bucket->perm[p + i];
110 bucket->perm[p + i] = bucket->perm[p];
111 bucket->perm[p] = t;
112 }
113 dprintk(" perm_choose swap %d with %d\n", p, p+i);
114 }
115 bucket->perm_n++;
116 }
117 for (i = 0; i < bucket->size; i++)
118 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
119
120 s = bucket->perm[pr];
121out:
122 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
123 bucket->size, x, r, pr, s);
124 return bucket->items[s];
125}
126
127/* uniform */
128static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
129 int x, int r)
130{
131 return bucket_perm_choose(&bucket->h, x, r);
132}
133
134/* list */
135static int bucket_list_choose(struct crush_bucket_list *bucket,
136 int x, int r)
137{
138 int i;
139
140 for (i = bucket->h.size-1; i >= 0; i--) {
141 __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
142 r, bucket->h.id);
143 w &= 0xffff;
144 dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
145 "sw %x rand %llx",
146 i, x, r, bucket->h.items[i], bucket->item_weights[i],
147 bucket->sum_weights[i], w);
148 w *= bucket->sum_weights[i];
149 w = w >> 16;
150 /*dprintk(" scaled %llx\n", w);*/
151 if (w < bucket->item_weights[i])
152 return bucket->h.items[i];
153 }
154
155 BUG_ON(1);
156 return 0;
157}
158
159
160/* (binary) tree */
/*
 * Height of tree node @n: the number of trailing zero bits.
 * @n must be non-zero, otherwise the loop never terminates.
 */
static int height(int n)
{
	int h;

	for (h = 0; !(n & 1); n >>= 1)
		h++;
	return h;
}
170
/* Left child of non-terminal node @x. */
static int left(int x)
{
	int step = 1 << (height(x) - 1);

	return x - step;
}
176
/* Right child of non-terminal node @x. */
static int right(int x)
{
	int step = 1 << (height(x) - 1);

	return x + step;
}
182
/* Odd-numbered nodes are terminal: returns 1 for them, 0 otherwise. */
static int terminal(int x)
{
	return (x & 1) != 0;
}
187
188static int bucket_tree_choose(struct crush_bucket_tree *bucket,
189 int x, int r)
190{
191 int n, l;
192 __u32 w;
193 __u64 t;
194
195 /* start at root */
196 n = bucket->num_nodes >> 1;
197
198 while (!terminal(n)) {
199 /* pick point in [0, w) */
200 w = bucket->node_weights[n];
201 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
202 bucket->h.id) * (__u64)w;
203 t = t >> 32;
204
205 /* descend to the left or right? */
206 l = left(n);
207 if (t < bucket->node_weights[l])
208 n = l;
209 else
210 n = right(n);
211 }
212
213 return bucket->h.items[n >> 1];
214}
215
216
217/* straw */
218
219static int bucket_straw_choose(struct crush_bucket_straw *bucket,
220 int x, int r)
221{
222 int i;
223 int high = 0;
224 __u64 high_draw = 0;
225 __u64 draw;
226
227 for (i = 0; i < bucket->h.size; i++) {
228 draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
229 draw &= 0xffff;
230 draw *= bucket->straws[i];
231 if (i == 0 || draw > high_draw) {
232 high = i;
233 high_draw = draw;
234 }
235 }
236 return bucket->h.items[high];
237}
238
239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
240{
241 dprintk("choose %d x=%d r=%d\n", in->id, x, r);
242 switch (in->alg) {
243 case CRUSH_BUCKET_UNIFORM:
244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
245 x, r);
246 case CRUSH_BUCKET_LIST:
247 return bucket_list_choose((struct crush_bucket_list *)in,
248 x, r);
249 case CRUSH_BUCKET_TREE:
250 return bucket_tree_choose((struct crush_bucket_tree *)in,
251 x, r);
252 case CRUSH_BUCKET_STRAW:
253 return bucket_straw_choose((struct crush_bucket_straw *)in,
254 x, r);
255 default:
256 BUG_ON(1);
257 return in->items[0];
258 }
259}
260
261/*
262 * true if device is marked "out" (failed, fully offloaded)
263 * of the cluster
264 */
265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
266{
267 if (weight[item] >= 0x1000)
268 return 0;
269 if (weight[item] == 0)
270 return 1;
271 if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
272 < weight[item])
273 return 0;
274 return 1;
275}
276
277/**
278 * crush_choose - choose numrep distinct items of given type
279 * @map: the crush_map
280 * @bucket: the bucket we are choose an item from
281 * @x: crush input value
282 * @numrep: the number of items to choose
283 * @type: the type of item to choose
284 * @out: pointer to output vector
285 * @outpos: our position in that vector
286 * @firstn: true if choosing "first n" items, false if choosing "indep"
287 * @recurse_to_leaf: true if we want one device under each item of given type
288 * @out2: second output vector for leaf items (if @recurse_to_leaf)
289 */
290static int crush_choose(struct crush_map *map,
291 struct crush_bucket *bucket,
292 __u32 *weight,
293 int x, int numrep, int type,
294 int *out, int outpos,
295 int firstn, int recurse_to_leaf,
296 int *out2)
297{
298 int rep;
299 int ftotal, flocal;
300 int retry_descent, retry_bucket, skip_rep;
301 struct crush_bucket *in = bucket;
302 int r;
303 int i;
304 int item = 0;
305 int itemtype;
306 int collide, reject;
307 const int orig_tries = 5; /* attempts before we fall back to search */
308 dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos);
309
310 for (rep = outpos; rep < numrep; rep++) {
311 /* keep trying until we get a non-out, non-colliding item */
312 ftotal = 0;
313 skip_rep = 0;
314 do {
315 retry_descent = 0;
316 in = bucket; /* initial bucket */
317
318 /* choose through intervening buckets */
319 flocal = 0;
320 do {
321 collide = 0;
322 retry_bucket = 0;
323 r = rep;
324 if (in->alg == CRUSH_BUCKET_UNIFORM) {
325 /* be careful */
326 if (firstn || numrep >= in->size)
327 /* r' = r + f_total */
328 r += ftotal;
329 else if (in->size % numrep == 0)
330 /* r'=r+(n+1)*f_local */
331 r += (numrep+1) *
332 (flocal+ftotal);
333 else
334 /* r' = r + n*f_local */
335 r += numrep * (flocal+ftotal);
336 } else {
337 if (firstn)
338 /* r' = r + f_total */
339 r += ftotal;
340 else
341 /* r' = r + n*f_local */
342 r += numrep * (flocal+ftotal);
343 }
344
345 /* bucket choose */
346 if (in->size == 0) {
347 reject = 1;
348 goto reject;
349 }
350 if (flocal >= (in->size>>1) &&
351 flocal > orig_tries)
352 item = bucket_perm_choose(in, x, r);
353 else
354 item = crush_bucket_choose(in, x, r);
355 BUG_ON(item >= map->max_devices);
356
357 /* desired type? */
358 if (item < 0)
359 itemtype = map->buckets[-1-item]->type;
360 else
361 itemtype = 0;
362 dprintk(" item %d type %d\n", item, itemtype);
363
364 /* keep going? */
365 if (itemtype != type) {
366 BUG_ON(item >= 0 ||
367 (-1-item) >= map->max_buckets);
368 in = map->buckets[-1-item];
369 continue;
370 }
371
372 /* collision? */
373 for (i = 0; i < outpos; i++) {
374 if (out[i] == item) {
375 collide = 1;
376 break;
377 }
378 }
379
380 if (recurse_to_leaf &&
381 item < 0 &&
382 crush_choose(map, map->buckets[-1-item],
383 weight,
384 x, outpos+1, 0,
385 out2, outpos,
386 firstn, 0, NULL) <= outpos) {
387 reject = 1;
388 } else {
389 /* out? */
390 if (itemtype == 0)
391 reject = is_out(map, weight,
392 item, x);
393 else
394 reject = 0;
395 }
396
397reject:
398 if (reject || collide) {
399 ftotal++;
400 flocal++;
401
402 if (collide && flocal < 3)
403 /* retry locally a few times */
404 retry_bucket = 1;
405 else if (flocal < in->size + orig_tries)
406 /* exhaustive bucket search */
407 retry_bucket = 1;
408 else if (ftotal < 20)
409 /* then retry descent */
410 retry_descent = 1;
411 else
412 /* else give up */
413 skip_rep = 1;
414 dprintk(" reject %d collide %d "
415 "ftotal %d flocal %d\n",
416 reject, collide, ftotal,
417 flocal);
418 }
419 } while (retry_bucket);
420 } while (retry_descent);
421
422 if (skip_rep) {
423 dprintk("skip rep\n");
424 continue;
425 }
426
427 dprintk("choose got %d\n", item);
428 out[outpos] = item;
429 outpos++;
430 }
431
432 dprintk("choose returns %d\n", outpos);
433 return outpos;
434}
435
436
437/**
438 * crush_do_rule - calculate a mapping with the given input and rule
439 * @map: the crush_map
440 * @ruleno: the rule id
441 * @x: hash input
442 * @result: pointer to result vector
443 * @result_max: maximum result size
444 * @force: force initial replica choice; -1 for none
445 */
446int crush_do_rule(struct crush_map *map,
447 int ruleno, int x, int *result, int result_max,
448 int force, __u32 *weight)
449{
450 int result_len;
451 int force_context[CRUSH_MAX_DEPTH];
452 int force_pos = -1;
453 int a[CRUSH_MAX_SET];
454 int b[CRUSH_MAX_SET];
455 int c[CRUSH_MAX_SET];
456 int recurse_to_leaf;
457 int *w;
458 int wsize = 0;
459 int *o;
460 int osize;
461 int *tmp;
462 struct crush_rule *rule;
463 int step;
464 int i, j;
465 int numrep;
466 int firstn;
467 int rc = -1;
468
469 BUG_ON(ruleno >= map->max_rules);
470
471 rule = map->rules[ruleno];
472 result_len = 0;
473 w = a;
474 o = b;
475
476 /*
477 * determine hierarchical context of force, if any. note
478 * that this may or may not correspond to the specific types
479 * referenced by the crush rule.
480 */
481 if (force >= 0) {
482 if (force >= map->max_devices ||
483 map->device_parents[force] == 0) {
484 /*dprintk("CRUSH: forcefed device dne\n");*/
485 rc = -1; /* force fed device dne */
486 goto out;
487 }
488 if (!is_out(map, weight, force, x)) {
489 while (1) {
490 force_context[++force_pos] = force;
491 if (force >= 0)
492 force = map->device_parents[force];
493 else
494 force = map->bucket_parents[-1-force];
495 if (force == 0)
496 break;
497 }
498 }
499 }
500
501 for (step = 0; step < rule->len; step++) {
502 firstn = 0;
503 switch (rule->steps[step].op) {
504 case CRUSH_RULE_TAKE:
505 w[0] = rule->steps[step].arg1;
506 if (force_pos >= 0) {
507 BUG_ON(force_context[force_pos] != w[0]);
508 force_pos--;
509 }
510 wsize = 1;
511 break;
512
513 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
514 case CRUSH_RULE_CHOOSE_FIRSTN:
515 firstn = 1;
516 case CRUSH_RULE_CHOOSE_LEAF_INDEP:
517 case CRUSH_RULE_CHOOSE_INDEP:
518 BUG_ON(wsize == 0);
519
520 recurse_to_leaf =
521 rule->steps[step].op ==
522 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
523 rule->steps[step].op ==
524 CRUSH_RULE_CHOOSE_LEAF_INDEP;
525
526 /* reset output */
527 osize = 0;
528
529 for (i = 0; i < wsize; i++) {
530 /*
531 * see CRUSH_N, CRUSH_N_MINUS macros.
532 * basically, numrep <= 0 means relative to
533 * the provided result_max
534 */
535 numrep = rule->steps[step].arg1;
536 if (numrep <= 0) {
537 numrep += result_max;
538 if (numrep <= 0)
539 continue;
540 }
541 j = 0;
542 if (osize == 0 && force_pos >= 0) {
543 /* skip any intermediate types */
544 while (force_pos &&
545 force_context[force_pos] < 0 &&
546 rule->steps[step].arg2 !=
547 map->buckets[-1 -
548 force_context[force_pos]]->type)
549 force_pos--;
550 o[osize] = force_context[force_pos];
551 if (recurse_to_leaf)
552 c[osize] = force_context[0];
553 j++;
554 force_pos--;
555 }
556 osize += crush_choose(map,
557 map->buckets[-1-w[i]],
558 weight,
559 x, numrep,
560 rule->steps[step].arg2,
561 o+osize, j,
562 firstn,
563 recurse_to_leaf, c+osize);
564 }
565
566 if (recurse_to_leaf)
567 /* copy final _leaf_ values to output set */
568 memcpy(o, c, osize*sizeof(*o));
569
570 /* swap t and w arrays */
571 tmp = o;
572 o = w;
573 w = tmp;
574 wsize = osize;
575 break;
576
577
578 case CRUSH_RULE_EMIT:
579 for (i = 0; i < wsize && result_len < result_max; i++) {
580 result[result_len] = w[i];
581 result_len++;
582 }
583 wsize = 0;
584 break;
585
586 default:
587 BUG_ON(1);
588 }
589 }
590 rc = result_len;
591
592out:
593 return rc;
594}
595
596
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
new file mode 100644
index 000000000000..98e90046fd9f
--- /dev/null
+++ b/fs/ceph/crush/mapper.h
@@ -0,0 +1,20 @@
1#ifndef _CRUSH_MAPPER_H
2#define _CRUSH_MAPPER_H
3
4/*
5 * CRUSH functions for finding rules and then mapping an input to an
6 * output set.
7 *
8 * LGPL2
9 */
10
11#include "crush.h"
12
13extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
14extern int crush_do_rule(struct crush_map *map,
15 int ruleno,
16 int x, int *result, int result_max,
17 int forcefeed, /* -1 for none */
18 __u32 *weights);
19
20#endif
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
new file mode 100644
index 000000000000..f704b3b62424
--- /dev/null
+++ b/fs/ceph/crypto.c
@@ -0,0 +1,409 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/scatterlist.h>
6#include <linux/slab.h>
7#include <crypto/hash.h>
8
9#include "crypto.h"
10#include "decode.h"
11
12int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
13{
14 if (*p + sizeof(u16) + sizeof(key->created) +
15 sizeof(u16) + key->len > end)
16 return -ERANGE;
17 ceph_encode_16(p, key->type);
18 ceph_encode_copy(p, &key->created, sizeof(key->created));
19 ceph_encode_16(p, key->len);
20 ceph_encode_copy(p, key->key, key->len);
21 return 0;
22}
23
24int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
25{
26 ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
27 key->type = ceph_decode_16(p);
28 ceph_decode_copy(p, &key->created, sizeof(key->created));
29 key->len = ceph_decode_16(p);
30 ceph_decode_need(p, end, key->len, bad);
31 key->key = kmalloc(key->len, GFP_NOFS);
32 if (!key->key)
33 return -ENOMEM;
34 ceph_decode_copy(p, key->key, key->len);
35 return 0;
36
37bad:
38 dout("failed to decode crypto key\n");
39 return -EINVAL;
40}
41
42int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
43{
44 int inlen = strlen(inkey);
45 int blen = inlen * 3 / 4;
46 void *buf, *p;
47 int ret;
48
49 dout("crypto_key_unarmor %s\n", inkey);
50 buf = kmalloc(blen, GFP_NOFS);
51 if (!buf)
52 return -ENOMEM;
53 blen = ceph_unarmor(buf, inkey, inkey+inlen);
54 if (blen < 0) {
55 kfree(buf);
56 return blen;
57 }
58
59 p = buf;
60 ret = ceph_crypto_key_decode(key, &p, p + blen);
61 kfree(buf);
62 if (ret)
63 return ret;
64 dout("crypto_key_unarmor key %p type %d len %d\n", key,
65 key->type, key->len);
66 return 0;
67}
68
69
70
71#define AES_KEY_SIZE 16
72
73static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
74{
75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
76}
77
/*
 * Fixed initialization vector: the AES helpers in this file memcpy() the
 * first crypto_blkcipher_ivsize() bytes of it into the cipher's IV buffer
 * before every operation.
 */
78const u8 *aes_iv = "cephsageyudagreg";
79
80int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
81 const void *src, size_t src_len)
82{
83 struct scatterlist sg_in[2], sg_out[1];
84 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
85 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
86 int ret;
87 void *iv;
88 int ivsize;
89 size_t zero_padding = (0x10 - (src_len & 0x0f));
90 char pad[16];
91
92 if (IS_ERR(tfm))
93 return PTR_ERR(tfm);
94
95 memset(pad, zero_padding, zero_padding);
96
97 *dst_len = src_len + zero_padding;
98
99 crypto_blkcipher_setkey((void *)tfm, key, key_len);
100 sg_init_table(sg_in, 2);
101 sg_set_buf(&sg_in[0], src, src_len);
102 sg_set_buf(&sg_in[1], pad, zero_padding);
103 sg_init_table(sg_out, 1);
104 sg_set_buf(sg_out, dst, *dst_len);
105 iv = crypto_blkcipher_crt(tfm)->iv;
106 ivsize = crypto_blkcipher_ivsize(tfm);
107
108 memcpy(iv, aes_iv, ivsize);
109 /*
110 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
111 key, key_len, 1);
112 print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
113 src, src_len, 1);
114 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
115 pad, zero_padding, 1);
116 */
117 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
118 src_len + zero_padding);
119 crypto_free_blkcipher(tfm);
120 if (ret < 0)
121 pr_err("ceph_aes_crypt failed %d\n", ret);
122 /*
123 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
124 dst, *dst_len, 1);
125 */
126 return 0;
127}
128
129int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
130 const void *src1, size_t src1_len,
131 const void *src2, size_t src2_len)
132{
133 struct scatterlist sg_in[3], sg_out[1];
134 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
135 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
136 int ret;
137 void *iv;
138 int ivsize;
139 size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
140 char pad[16];
141
142 if (IS_ERR(tfm))
143 return PTR_ERR(tfm);
144
145 memset(pad, zero_padding, zero_padding);
146
147 *dst_len = src1_len + src2_len + zero_padding;
148
149 crypto_blkcipher_setkey((void *)tfm, key, key_len);
150 sg_init_table(sg_in, 3);
151 sg_set_buf(&sg_in[0], src1, src1_len);
152 sg_set_buf(&sg_in[1], src2, src2_len);
153 sg_set_buf(&sg_in[2], pad, zero_padding);
154 sg_init_table(sg_out, 1);
155 sg_set_buf(sg_out, dst, *dst_len);
156 iv = crypto_blkcipher_crt(tfm)->iv;
157 ivsize = crypto_blkcipher_ivsize(tfm);
158
159 memcpy(iv, aes_iv, ivsize);
160 /*
161 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
162 key, key_len, 1);
163 print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
164 src1, src1_len, 1);
165 print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
166 src2, src2_len, 1);
167 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
168 pad, zero_padding, 1);
169 */
170 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
171 src1_len + src2_len + zero_padding);
172 crypto_free_blkcipher(tfm);
173 if (ret < 0)
174 pr_err("ceph_aes_crypt2 failed %d\n", ret);
175 /*
176 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
177 dst, *dst_len, 1);
178 */
179 return 0;
180}
181
182int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
183 const void *src, size_t src_len)
184{
185 struct scatterlist sg_in[1], sg_out[2];
186 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
187 struct blkcipher_desc desc = { .tfm = tfm };
188 char pad[16];
189 void *iv;
190 int ivsize;
191 int ret;
192 int last_byte;
193
194 if (IS_ERR(tfm))
195 return PTR_ERR(tfm);
196
197 crypto_blkcipher_setkey((void *)tfm, key, key_len);
198 sg_init_table(sg_in, 1);
199 sg_init_table(sg_out, 2);
200 sg_set_buf(sg_in, src, src_len);
201 sg_set_buf(&sg_out[0], dst, *dst_len);
202 sg_set_buf(&sg_out[1], pad, sizeof(pad));
203
204 iv = crypto_blkcipher_crt(tfm)->iv;
205 ivsize = crypto_blkcipher_ivsize(tfm);
206
207 memcpy(iv, aes_iv, ivsize);
208
209 /*
210 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
211 key, key_len, 1);
212 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
213 src, src_len, 1);
214 */
215
216 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
217 crypto_free_blkcipher(tfm);
218 if (ret < 0) {
219 pr_err("ceph_aes_decrypt failed %d\n", ret);
220 return ret;
221 }
222
223 if (src_len <= *dst_len)
224 last_byte = ((char *)dst)[src_len - 1];
225 else
226 last_byte = pad[src_len - *dst_len - 1];
227 if (last_byte <= 16 && src_len >= last_byte) {
228 *dst_len = src_len - last_byte;
229 } else {
230 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
231 last_byte, (int)src_len);
232 return -EPERM; /* bad padding */
233 }
234 /*
235 print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
236 dst, *dst_len, 1);
237 */
238 return 0;
239}
240
241int ceph_aes_decrypt2(const void *key, int key_len,
242 void *dst1, size_t *dst1_len,
243 void *dst2, size_t *dst2_len,
244 const void *src, size_t src_len)
245{
246 struct scatterlist sg_in[1], sg_out[3];
247 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
248 struct blkcipher_desc desc = { .tfm = tfm };
249 char pad[16];
250 void *iv;
251 int ivsize;
252 int ret;
253 int last_byte;
254
255 if (IS_ERR(tfm))
256 return PTR_ERR(tfm);
257
258 sg_init_table(sg_in, 1);
259 sg_set_buf(sg_in, src, src_len);
260 sg_init_table(sg_out, 3);
261 sg_set_buf(&sg_out[0], dst1, *dst1_len);
262 sg_set_buf(&sg_out[1], dst2, *dst2_len);
263 sg_set_buf(&sg_out[2], pad, sizeof(pad));
264
265 crypto_blkcipher_setkey((void *)tfm, key, key_len);
266 iv = crypto_blkcipher_crt(tfm)->iv;
267 ivsize = crypto_blkcipher_ivsize(tfm);
268
269 memcpy(iv, aes_iv, ivsize);
270
271 /*
272 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
273 key, key_len, 1);
274 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
275 src, src_len, 1);
276 */
277
278 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
279 crypto_free_blkcipher(tfm);
280 if (ret < 0) {
281 pr_err("ceph_aes_decrypt failed %d\n", ret);
282 return ret;
283 }
284
285 if (src_len <= *dst1_len)
286 last_byte = ((char *)dst1)[src_len - 1];
287 else if (src_len <= *dst1_len + *dst2_len)
288 last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
289 else
290 last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
291 if (last_byte <= 16 && src_len >= last_byte) {
292 src_len -= last_byte;
293 } else {
294 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
295 last_byte, (int)src_len);
296 return -EPERM; /* bad padding */
297 }
298
299 if (src_len < *dst1_len) {
300 *dst1_len = src_len;
301 *dst2_len = 0;
302 } else {
303 *dst2_len = src_len - *dst1_len;
304 }
305 /*
306 print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
307 dst1, *dst1_len, 1);
308 print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
309 dst2, *dst2_len, 1);
310 */
311
312 return 0;
313}
314
315
316int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
317 const void *src, size_t src_len)
318{
319 switch (secret->type) {
320 case CEPH_CRYPTO_NONE:
321 if (*dst_len < src_len)
322 return -ERANGE;
323 memcpy(dst, src, src_len);
324 *dst_len = src_len;
325 return 0;
326
327 case CEPH_CRYPTO_AES:
328 return ceph_aes_decrypt(secret->key, secret->len, dst,
329 dst_len, src, src_len);
330
331 default:
332 return -EINVAL;
333 }
334}
335
336int ceph_decrypt2(struct ceph_crypto_key *secret,
337 void *dst1, size_t *dst1_len,
338 void *dst2, size_t *dst2_len,
339 const void *src, size_t src_len)
340{
341 size_t t;
342
343 switch (secret->type) {
344 case CEPH_CRYPTO_NONE:
345 if (*dst1_len + *dst2_len < src_len)
346 return -ERANGE;
347 t = min(*dst1_len, src_len);
348 memcpy(dst1, src, t);
349 *dst1_len = t;
350 src += t;
351 src_len -= t;
352 if (src_len) {
353 t = min(*dst2_len, src_len);
354 memcpy(dst2, src, t);
355 *dst2_len = t;
356 }
357 return 0;
358
359 case CEPH_CRYPTO_AES:
360 return ceph_aes_decrypt2(secret->key, secret->len,
361 dst1, dst1_len, dst2, dst2_len,
362 src, src_len);
363
364 default:
365 return -EINVAL;
366 }
367}
368
369int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
370 const void *src, size_t src_len)
371{
372 switch (secret->type) {
373 case CEPH_CRYPTO_NONE:
374 if (*dst_len < src_len)
375 return -ERANGE;
376 memcpy(dst, src, src_len);
377 *dst_len = src_len;
378 return 0;
379
380 case CEPH_CRYPTO_AES:
381 return ceph_aes_encrypt(secret->key, secret->len, dst,
382 dst_len, src, src_len);
383
384 default:
385 return -EINVAL;
386 }
387}
388
389int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
390 const void *src1, size_t src1_len,
391 const void *src2, size_t src2_len)
392{
393 switch (secret->type) {
394 case CEPH_CRYPTO_NONE:
395 if (*dst_len < src1_len + src2_len)
396 return -ERANGE;
397 memcpy(dst, src1, src1_len);
398 memcpy(dst + src1_len, src2, src2_len);
399 *dst_len = src1_len + src2_len;
400 return 0;
401
402 case CEPH_CRYPTO_AES:
403 return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
404 src1, src1_len, src2, src2_len);
405
406 default:
407 return -EINVAL;
408 }
409}
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
new file mode 100644
index 000000000000..40b502e6bd89
--- /dev/null
+++ b/fs/ceph/crypto.h
@@ -0,0 +1,48 @@
1#ifndef _FS_CEPH_CRYPTO_H
2#define _FS_CEPH_CRYPTO_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * cryptographic secret
9 */
10struct ceph_crypto_key {
11 int type; /* selects the cipher; encoded on the wire as 16 bits */
12 struct ceph_timespec created; /* creation time, carried with the encoded key */
13 int len; /* number of bytes at @key; encoded on the wire as 16 bits */
14 void *key; /* raw key bytes; released by ceph_crypto_key_destroy() */
15};
16
/*
 * Free the key material held by @key.  Only key->key is kfree()d; @key
 * itself is not freed and key->key is not reset.
 */
17static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
18{
19 kfree(key->key);
20}
21
22extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
23 void **p, void *end);
24extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
25 void **p, void *end);
26extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
27
28/* crypto.c */
29extern int ceph_decrypt(struct ceph_crypto_key *secret,
30 void *dst, size_t *dst_len,
31 const void *src, size_t src_len);
32extern int ceph_encrypt(struct ceph_crypto_key *secret,
33 void *dst, size_t *dst_len,
34 const void *src, size_t src_len);
35extern int ceph_decrypt2(struct ceph_crypto_key *secret,
36 void *dst1, size_t *dst1_len,
37 void *dst2, size_t *dst2_len,
38 const void *src, size_t src_len);
39extern int ceph_encrypt2(struct ceph_crypto_key *secret,
40 void *dst, size_t *dst_len,
41 const void *src1, size_t src1_len,
42 const void *src2, size_t src2_len);
43
44/* armor.c */
45extern int ceph_armor(char *dst, const void *src, const void *end);
46extern int ceph_unarmor(void *dst, const char *src, const char *end);
47
48#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
new file mode 100644
index 000000000000..f7048da92acc
--- /dev/null
+++ b/fs/ceph/debugfs.c
@@ -0,0 +1,484 @@
1#include "ceph_debug.h"
2
3#include <linux/device.h>
4#include <linux/slab.h>
5#include <linux/module.h>
6#include <linux/ctype.h>
7#include <linux/debugfs.h>
8#include <linux/seq_file.h>
9
10#include "super.h"
11#include "mds_client.h"
12#include "mon_client.h"
13#include "auth.h"
14
15#ifdef CONFIG_DEBUG_FS
16
17/*
18 * Implement /sys/kernel/debug/ceph fun
19 *
20 * /sys/kernel/debug/ceph/client* - an instance of the ceph client
21 * .../osdmap - current osdmap
22 * .../mdsmap - current mdsmap
23 * .../monmap - current monmap
24 * .../osdc - active osd requests
25 * .../mdsc - active mds requests
26 * .../monc - mon client state
27 * .../dentry_lru - dump contents of dentry lru
28 * .../caps - expose cap (reservation) stats
29 * .../bdi - symlink to ../../bdi/something
30 */
31
32static struct dentry *ceph_debugfs_dir;
33
34static int monmap_show(struct seq_file *s, void *p)
35{
36 int i;
37 struct ceph_client *client = s->private;
38
39 if (client->monc.monmap == NULL)
40 return 0;
41
42 seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
43 for (i = 0; i < client->monc.monmap->num_mon; i++) {
44 struct ceph_entity_inst *inst =
45 &client->monc.monmap->mon_inst[i];
46
47 seq_printf(s, "\t%s%lld\t%s\n",
48 ENTITY_NAME(inst->name),
49 pr_addr(&inst->addr.in_addr));
50 }
51 return 0;
52}
53
54static int mdsmap_show(struct seq_file *s, void *p)
55{
56 int i;
57 struct ceph_client *client = s->private;
58
59 if (client->mdsc.mdsmap == NULL)
60 return 0;
61 seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
62 seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
63 seq_printf(s, "session_timeout %d\n",
64 client->mdsc.mdsmap->m_session_timeout);
65 seq_printf(s, "session_autoclose %d\n",
66 client->mdsc.mdsmap->m_session_autoclose);
67 for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
68 struct ceph_entity_addr *addr =
69 &client->mdsc.mdsmap->m_info[i].addr;
70 int state = client->mdsc.mdsmap->m_info[i].state;
71
72 seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
73 ceph_mds_state_name(state));
74 }
75 return 0;
76}
77
78static int osdmap_show(struct seq_file *s, void *p)
79{
80 int i;
81 struct ceph_client *client = s->private;
82 struct rb_node *n;
83
84 if (client->osdc.osdmap == NULL)
85 return 0;
86 seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
87 seq_printf(s, "flags%s%s\n",
88 (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
89 " NEARFULL" : "",
90 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
91 " FULL" : "");
92 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
93 struct ceph_pg_pool_info *pool =
94 rb_entry(n, struct ceph_pg_pool_info, node);
95 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
96 pool->id, pool->v.pg_num, pool->pg_num_mask,
97 pool->v.lpg_num, pool->lpg_num_mask);
98 }
99 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
100 struct ceph_entity_addr *addr =
101 &client->osdc.osdmap->osd_addr[i];
102 int state = client->osdc.osdmap->osd_state[i];
103 char sb[64];
104
105 seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
106 i, pr_addr(&addr->in_addr),
107 ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
108 ceph_osdmap_state_str(sb, sizeof(sb), state));
109 }
110 return 0;
111}
112
113static int monc_show(struct seq_file *s, void *p)
114{
115 struct ceph_client *client = s->private;
116 struct ceph_mon_statfs_request *req;
117 struct ceph_mon_client *monc = &client->monc;
118 struct rb_node *rp;
119
120 mutex_lock(&monc->mutex);
121
122 if (monc->have_mdsmap)
123 seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
124 if (monc->have_osdmap)
125 seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
126 if (monc->want_next_osdmap)
127 seq_printf(s, "want next osdmap\n");
128
129 for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) {
130 req = rb_entry(rp, struct ceph_mon_statfs_request, node);
131 seq_printf(s, "%lld statfs\n", req->tid);
132 }
133
134 mutex_unlock(&monc->mutex);
135 return 0;
136}
137
138static int mdsc_show(struct seq_file *s, void *p)
139{
140 struct ceph_client *client = s->private;
141 struct ceph_mds_client *mdsc = &client->mdsc;
142 struct ceph_mds_request *req;
143 struct rb_node *rp;
144 int pathlen;
145 u64 pathbase;
146 char *path;
147
148 mutex_lock(&mdsc->mutex);
149 for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
150 req = rb_entry(rp, struct ceph_mds_request, r_node);
151
152 if (req->r_request)
153 seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds);
154 else
155 seq_printf(s, "%lld\t(no request)\t", req->r_tid);
156
157 seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
158
159 if (req->r_got_unsafe)
160 seq_printf(s, "\t(unsafe)");
161 else
162 seq_printf(s, "\t");
163
164 if (req->r_inode) {
165 seq_printf(s, " #%llx", ceph_ino(req->r_inode));
166 } else if (req->r_dentry) {
167 path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
168 &pathbase, 0);
169 spin_lock(&req->r_dentry->d_lock);
170 seq_printf(s, " #%llx/%.*s (%s)",
171 ceph_ino(req->r_dentry->d_parent->d_inode),
172 req->r_dentry->d_name.len,
173 req->r_dentry->d_name.name,
174 path ? path : "");
175 spin_unlock(&req->r_dentry->d_lock);
176 kfree(path);
177 } else if (req->r_path1) {
178 seq_printf(s, " #%llx/%s", req->r_ino1.ino,
179 req->r_path1);
180 }
181
182 if (req->r_old_dentry) {
183 path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
184 &pathbase, 0);
185 spin_lock(&req->r_old_dentry->d_lock);
186 seq_printf(s, " #%llx/%.*s (%s)",
187 ceph_ino(req->r_old_dentry->d_parent->d_inode),
188 req->r_old_dentry->d_name.len,
189 req->r_old_dentry->d_name.name,
190 path ? path : "");
191 spin_unlock(&req->r_old_dentry->d_lock);
192 kfree(path);
193 } else if (req->r_path2) {
194 if (req->r_ino2.ino)
195 seq_printf(s, " #%llx/%s", req->r_ino2.ino,
196 req->r_path2);
197 else
198 seq_printf(s, " %s", req->r_path2);
199 }
200
201 seq_printf(s, "\n");
202 }
203 mutex_unlock(&mdsc->mutex);
204
205 return 0;
206}
207
208static int osdc_show(struct seq_file *s, void *pp)
209{
210 struct ceph_client *client = s->private;
211 struct ceph_osd_client *osdc = &client->osdc;
212 struct rb_node *p;
213
214 mutex_lock(&osdc->request_mutex);
215 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
216 struct ceph_osd_request *req;
217 struct ceph_osd_request_head *head;
218 struct ceph_osd_op *op;
219 int num_ops;
220 int opcode, olen;
221 int i;
222
223 req = rb_entry(p, struct ceph_osd_request, r_node);
224
225 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
226 req->r_osd ? req->r_osd->o_osd : -1,
227 le32_to_cpu(req->r_pgid.pool),
228 le16_to_cpu(req->r_pgid.ps));
229
230 head = req->r_request->front.iov_base;
231 op = (void *)(head + 1);
232
233 num_ops = le16_to_cpu(head->num_ops);
234 olen = le32_to_cpu(head->object_len);
235 seq_printf(s, "%.*s", olen,
236 (const char *)(head->ops + num_ops));
237
238 if (req->r_reassert_version.epoch)
239 seq_printf(s, "\t%u'%llu",
240 (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
241 le64_to_cpu(req->r_reassert_version.version));
242 else
243 seq_printf(s, "\t");
244
245 for (i = 0; i < num_ops; i++) {
246 opcode = le16_to_cpu(op->op);
247 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
248 op++;
249 }
250
251 seq_printf(s, "\n");
252 }
253 mutex_unlock(&osdc->request_mutex);
254 return 0;
255}
256
257static int caps_show(struct seq_file *s, void *p)
258{
259 struct ceph_client *client = p;
260 int total, avail, used, reserved, min;
261
262 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
263 seq_printf(s, "total\t\t%d\n"
264 "avail\t\t%d\n"
265 "used\t\t%d\n"
266 "reserved\t%d\n"
267 "min\t%d\n",
268 total, avail, used, reserved, min);
269 return 0;
270}
271
272static int dentry_lru_show(struct seq_file *s, void *ptr)
273{
274 struct ceph_client *client = s->private;
275 struct ceph_mds_client *mdsc = &client->mdsc;
276 struct ceph_dentry_info *di;
277
278 spin_lock(&mdsc->dentry_lru_lock);
279 list_for_each_entry(di, &mdsc->dentry_lru, lru) {
280 struct dentry *dentry = di->dentry;
281 seq_printf(s, "%p %p\t%.*s\n",
282 di, dentry, dentry->d_name.len, dentry->d_name.name);
283 }
284 spin_unlock(&mdsc->dentry_lru_lock);
285
286 return 0;
287}
288
289#define DEFINE_SHOW_FUNC(name) \
290static int name##_open(struct inode *inode, struct file *file) \
291{ \
292 struct seq_file *sf; \
293 int ret; \
294 \
295 ret = single_open(file, name, NULL); \
296 sf = file->private_data; \
297 sf->private = inode->i_private; \
298 return ret; \
299} \
300 \
301static const struct file_operations name##_fops = { \
302 .open = name##_open, \
303 .read = seq_read, \
304 .llseek = seq_lseek, \
305 .release = single_release, \
306};
307
308DEFINE_SHOW_FUNC(monmap_show)
309DEFINE_SHOW_FUNC(mdsmap_show)
310DEFINE_SHOW_FUNC(osdmap_show)
311DEFINE_SHOW_FUNC(monc_show)
312DEFINE_SHOW_FUNC(mdsc_show)
313DEFINE_SHOW_FUNC(osdc_show)
314DEFINE_SHOW_FUNC(dentry_lru_show)
315DEFINE_SHOW_FUNC(caps_show)
316
317static int congestion_kb_set(void *data, u64 val)
318{
319 struct ceph_client *client = (struct ceph_client *)data;
320
321 if (client)
322 client->mount_args->congestion_kb = (int)val;
323
324 return 0;
325}
326
327static int congestion_kb_get(void *data, u64 *val)
328{
329 struct ceph_client *client = (struct ceph_client *)data;
330
331 if (client)
332 *val = (u64)client->mount_args->congestion_kb;
333
334 return 0;
335}
336
337
/*
 * File operations for the congestion_kb attribute: reads go through
 * congestion_kb_get(), writes through congestion_kb_set(), and the value
 * is formatted with "%llu\n".
 */
338DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
339 congestion_kb_set, "%llu\n");
340
341int __init ceph_debugfs_init(void)
342{
343 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
344 if (!ceph_debugfs_dir)
345 return -ENOMEM;
346 return 0;
347}
348
/* Remove the top-level debugfs directory recorded in ceph_debugfs_dir. */
349void ceph_debugfs_cleanup(void)
350{
351 debugfs_remove(ceph_debugfs_dir);
352}
353
354int ceph_debugfs_client_init(struct ceph_client *client)
355{
356 int ret = 0;
357 char name[80];
358
359 snprintf(name, sizeof(name), FSID_FORMAT ".client%lld",
360 PR_FSID(&client->fsid), client->monc.auth->global_id);
361
362 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
363 if (!client->debugfs_dir)
364 goto out;
365
366 client->monc.debugfs_file = debugfs_create_file("monc",
367 0600,
368 client->debugfs_dir,
369 client,
370 &monc_show_fops);
371 if (!client->monc.debugfs_file)
372 goto out;
373
374 client->mdsc.debugfs_file = debugfs_create_file("mdsc",
375 0600,
376 client->debugfs_dir,
377 client,
378 &mdsc_show_fops);
379 if (!client->mdsc.debugfs_file)
380 goto out;
381
382 client->osdc.debugfs_file = debugfs_create_file("osdc",
383 0600,
384 client->debugfs_dir,
385 client,
386 &osdc_show_fops);
387 if (!client->osdc.debugfs_file)
388 goto out;
389
390 client->debugfs_monmap = debugfs_create_file("monmap",
391 0600,
392 client->debugfs_dir,
393 client,
394 &monmap_show_fops);
395 if (!client->debugfs_monmap)
396 goto out;
397
398 client->debugfs_mdsmap = debugfs_create_file("mdsmap",
399 0600,
400 client->debugfs_dir,
401 client,
402 &mdsmap_show_fops);
403 if (!client->debugfs_mdsmap)
404 goto out;
405
406 client->debugfs_osdmap = debugfs_create_file("osdmap",
407 0600,
408 client->debugfs_dir,
409 client,
410 &osdmap_show_fops);
411 if (!client->debugfs_osdmap)
412 goto out;
413
414 client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
415 0600,
416 client->debugfs_dir,
417 client,
418 &dentry_lru_show_fops);
419 if (!client->debugfs_dentry_lru)
420 goto out;
421
422 client->debugfs_caps = debugfs_create_file("caps",
423 0400,
424 client->debugfs_dir,
425 client,
426 &caps_show_fops);
427 if (!client->debugfs_caps)
428 goto out;
429
430 client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb",
431 0600,
432 client->debugfs_dir,
433 client,
434 &congestion_kb_fops);
435 if (!client->debugfs_congestion_kb)
436 goto out;
437
438 sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
439 client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
440 name);
441
442 return 0;
443
444out:
445 ceph_debugfs_client_cleanup(client);
446 return ret;
447}
448
449void ceph_debugfs_client_cleanup(struct ceph_client *client)
450{
451 debugfs_remove(client->debugfs_bdi);
452 debugfs_remove(client->debugfs_caps);
453 debugfs_remove(client->debugfs_dentry_lru);
454 debugfs_remove(client->debugfs_osdmap);
455 debugfs_remove(client->debugfs_mdsmap);
456 debugfs_remove(client->debugfs_monmap);
457 debugfs_remove(client->osdc.debugfs_file);
458 debugfs_remove(client->mdsc.debugfs_file);
459 debugfs_remove(client->monc.debugfs_file);
460 debugfs_remove(client->debugfs_congestion_kb);
461 debugfs_remove(client->debugfs_dir);
462}
463
464#else // CONFIG_DEBUG_FS
465
/*
 * Stubs used when CONFIG_DEBUG_FS is not set: the init routines report
 * success (0) and the cleanup routines do nothing.
 */
466int __init ceph_debugfs_init(void)
467{
468 return 0;
469}
470
471void ceph_debugfs_cleanup(void)
472{
473}
474
475int ceph_debugfs_client_init(struct ceph_client *client)
476{
477 return 0;
478}
479
480void ceph_debugfs_client_cleanup(struct ceph_client *client)
481{
482}
483
484#endif // CONFIG_DEBUG_FS
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
new file mode 100644
index 000000000000..65b3e022eaf5
--- /dev/null
+++ b/fs/ceph/decode.h
@@ -0,0 +1,194 @@
1#ifndef __CEPH_DECODE_H
2#define __CEPH_DECODE_H
3
4#include <asm/unaligned.h>
5#include <linux/time.h>
6
7#include "types.h"
8
9/*
10 * in all cases,
11 * void **p pointer to position pointer
12 * void *end pointer to end of buffer (last byte + 1)
13 */
14
15static inline u64 ceph_decode_64(void **p)
16{
17 u64 v = get_unaligned_le64(*p);
18 *p += sizeof(u64);
19 return v;
20}
21static inline u32 ceph_decode_32(void **p)
22{
23 u32 v = get_unaligned_le32(*p);
24 *p += sizeof(u32);
25 return v;
26}
27static inline u16 ceph_decode_16(void **p)
28{
29 u16 v = get_unaligned_le16(*p);
30 *p += sizeof(u16);
31 return v;
32}
33static inline u8 ceph_decode_8(void **p)
34{
35 u8 v = *(u8 *)*p;
36 (*p)++;
37 return v;
38}
39static inline void ceph_decode_copy(void **p, void *pv, size_t n)
40{
41 memcpy(pv, *p, n);
42 *p += n;
43}
44
45/*
46 * bounds check input.
47 */
/*
 * ceph_decode_need(p, end, n, bad): jump to label @bad unless at least @n
 * bytes remain between *p and @end.  @p is a void ** position pointer.
 */
48#define ceph_decode_need(p, end, n, bad) \
49 do { \
50 if (unlikely(*(p) + (n) > (end))) \
51 goto bad; \
52 } while (0)
53
/*
 * Checked decoders: each one first does ceph_decode_need() for the size
 * of the value (jumping to @bad if the input is too short) and then
 * decodes it into @v (or copies @n bytes into @pv), advancing *p.
 */
54#define ceph_decode_64_safe(p, end, v, bad) \
55 do { \
56 ceph_decode_need(p, end, sizeof(u64), bad); \
57 v = ceph_decode_64(p); \
58 } while (0)
59#define ceph_decode_32_safe(p, end, v, bad) \
60 do { \
61 ceph_decode_need(p, end, sizeof(u32), bad); \
62 v = ceph_decode_32(p); \
63 } while (0)
64#define ceph_decode_16_safe(p, end, v, bad) \
65 do { \
66 ceph_decode_need(p, end, sizeof(u16), bad); \
67 v = ceph_decode_16(p); \
68 } while (0)
69#define ceph_decode_8_safe(p, end, v, bad) \
70 do { \
71 ceph_decode_need(p, end, sizeof(u8), bad); \
72 v = ceph_decode_8(p); \
73 } while (0)
74
75#define ceph_decode_copy_safe(p, end, pv, n, bad) \
76 do { \
77 ceph_decode_need(p, end, n, bad); \
78 ceph_decode_copy(p, pv, n); \
79 } while (0)
80
81/*
82 * struct ceph_timespec <-> struct timespec
83 */
84static inline void ceph_decode_timespec(struct timespec *ts,
85 const struct ceph_timespec *tv)
86{
87 ts->tv_sec = le32_to_cpu(tv->tv_sec);
88 ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
89}
90static inline void ceph_encode_timespec(struct ceph_timespec *tv,
91 const struct timespec *ts)
92{
93 tv->tv_sec = cpu_to_le32(ts->tv_sec);
94 tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
95}
96
97/*
98 * sockaddr_storage <-> ceph_sockaddr
99 */
100static inline void ceph_encode_addr(struct ceph_entity_addr *a)
101{
102 a->in_addr.ss_family = htons(a->in_addr.ss_family);
103}
104static inline void ceph_decode_addr(struct ceph_entity_addr *a)
105{
106 a->in_addr.ss_family = ntohs(a->in_addr.ss_family);
107 WARN_ON(a->in_addr.ss_family == 512);
108}
109
110/*
111 * encoders
112 */
113static inline void ceph_encode_64(void **p, u64 v)
114{
115 put_unaligned_le64(v, (__le64 *)*p);
116 *p += sizeof(u64);
117}
118static inline void ceph_encode_32(void **p, u32 v)
119{
120 put_unaligned_le32(v, (__le32 *)*p);
121 *p += sizeof(u32);
122}
123static inline void ceph_encode_16(void **p, u16 v)
124{
125 put_unaligned_le16(v, (__le16 *)*p);
126 *p += sizeof(u16);
127}
128static inline void ceph_encode_8(void **p, u8 v)
129{
130 *(u8 *)*p = v;
131 (*p)++;
132}
/* Copy @len bytes from @s to *p and advance *p by @len. */
static inline void ceph_encode_copy(void **p, const void *s, int len)
{
	void *to = *p;

	memcpy(to, s, len);
	*p = to + len;
}
138
139/*
140 * filepath, string encoders
141 */
142static inline void ceph_encode_filepath(void **p, void *end,
143 u64 ino, const char *path)
144{
145 u32 len = path ? strlen(path) : 0;
146 BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
147 ceph_encode_8(p, 1);
148 ceph_encode_64(p, ino);
149 ceph_encode_32(p, len);
150 if (len)
151 memcpy(*p, path, len);
152 *p += len;
153}
154
155static inline void ceph_encode_string(void **p, void *end,
156 const char *s, u32 len)
157{
158 BUG_ON(*p + sizeof(len) + len > end);
159 ceph_encode_32(p, len);
160 if (len)
161 memcpy(*p, s, len);
162 *p += len;
163}
164
/*
 * ceph_encode_need(p, end, n, bad): jump to label @bad unless at least @n
 * bytes of room remain between *p and @end.  @p is a void ** position
 * pointer.
 */
165#define ceph_encode_need(p, end, n, bad) \
166 do { \
167 if (unlikely(*(p) + (n) > (end))) \
168 goto bad; \
169 } while (0)
170
/*
 * Checked encoders: each one first does ceph_encode_need() for the size
 * of the value (jumping to @bad if there is no room) and then encodes @v
 * (or copies @n bytes from @pv), advancing *p.
 */
171#define ceph_encode_64_safe(p, end, v, bad) \
172 do { \
173 ceph_encode_need(p, end, sizeof(u64), bad); \
174 ceph_encode_64(p, v); \
175 } while (0)
176#define ceph_encode_32_safe(p, end, v, bad) \
177 do { \
178 ceph_encode_need(p, end, sizeof(u32), bad); \
179 ceph_encode_32(p, v); \
180 } while (0)
181#define ceph_encode_16_safe(p, end, v, bad) \
182 do { \
183 ceph_encode_need(p, end, sizeof(u16), bad); \
184 ceph_encode_16(p, v); \
185 } while (0)
186
187#define ceph_encode_copy_safe(p, end, pv, n, bad) \
188 do { \
189 ceph_encode_need(p, end, n, bad); \
190 ceph_encode_copy(p, pv, n); \
191 } while (0)
192
193
194#endif
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
new file mode 100644
index 000000000000..ea8ee2e526aa
--- /dev/null
+++ b/fs/ceph/dir.c
@@ -0,0 +1,1224 @@
1#include "ceph_debug.h"
2
3#include <linux/spinlock.h>
4#include <linux/fs_struct.h>
5#include <linux/namei.h>
6#include <linux/slab.h>
7#include <linux/sched.h>
8
9#include "super.h"
10
11/*
12 * Directory operations: readdir, lookup, create, link, unlink,
13 * rename, etc.
14 */
15
16/*
17 * Ceph MDS operations are specified in terms of a base ino and
18 * relative path. Thus, the client can specify an operation on a
19 * specific inode (e.g., a getattr due to fstat(2)), or as a path
20 * relative to, say, the root directory.
21 *
22 * Normally, we limit ourselves to strict inode ops (no path component)
23 * or dentry operations (a single path component relative to an ino). The
24 * exception to this is open_root_dentry(), which will open the mount
25 * point by name.
26 */
27
28const struct inode_operations ceph_dir_iops;
29const struct file_operations ceph_dir_fops;
30struct dentry_operations ceph_dentry_ops;
31
32/*
33 * Initialize ceph dentry state.
34 */
35int ceph_init_dentry(struct dentry *dentry)
36{
37 struct ceph_dentry_info *di;
38
39 if (dentry->d_fsdata)
40 return 0;
41
42 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
43 dentry->d_op = &ceph_dentry_ops;
44 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
45 dentry->d_op = &ceph_snapdir_dentry_ops;
46 else
47 dentry->d_op = &ceph_snap_dentry_ops;
48
49 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
50 if (!di)
51 return -ENOMEM; /* oh well */
52
53 spin_lock(&dentry->d_lock);
54 if (dentry->d_fsdata) /* lost a race */
55 goto out_unlock;
56 di->dentry = dentry;
57 di->lease_session = NULL;
58 dentry->d_fsdata = di;
59 dentry->d_time = jiffies;
60 ceph_dentry_lru_add(dentry);
61out_unlock:
62 spin_unlock(&dentry->d_lock);
63 return 0;
64}
65
66
67
68/*
69 * for readdir, we encode the directory frag and offset within that
70 * frag into f_pos.
71 */
static unsigned fpos_frag(loff_t p)
{
	/* the frag is carried in the bits above the low 32 */
	unsigned frag = p >> 32;

	return frag;
}
static unsigned fpos_off(loff_t p)
{
	/* the offset within the frag is carried in the low 32 bits */
	const loff_t low_mask = 0xffffffff;

	return p & low_mask;
}
80
81/*
82 * When possible, we try to satisfy a readdir by peeking at the
83 * dcache. We make this work by carefully ordering dentries on
84 * d_u.d_child when we initially get results back from the MDS, and
85 * falling back to a "normal" sync readdir if any dentries in the dir
86 * are dropped.
87 *
88 * I_COMPLETE indicates we have all dentries in the dir. It is
89 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
90 * the MDS if/when the directory is modified).
91 */
92static int __dcache_readdir(struct file *filp,
93 void *dirent, filldir_t filldir)
94{
 /*
  * Entered with inode->i_lock held: the lock is released around
  * filldir() and dput() below and is held again on every return path.
  * Note that 'inode' and 'dir' are the same object
  * (filp->f_dentry->d_inode).
  *
  * Returns 0, a negative value propagated from filldir(), or -EAGAIN
  * when CEPH_I_COMPLETE was lost part-way through the walk.
  */
95 struct inode *inode = filp->f_dentry->d_inode;
96 struct ceph_file_info *fi = filp->private_data;
97 struct dentry *parent = filp->f_dentry;
98 struct inode *dir = parent->d_inode;
99 struct list_head *p;
100 struct dentry *dentry, *last;
101 struct ceph_dentry_info *di;
102 int err = 0;
103
104 /* claim ref on last dentry we returned */
105 last = fi->dentry;
106 fi->dentry = NULL;
107
108 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
109 last);
110
111 spin_lock(&dcache_lock);
112
113 /* start at beginning? */
114 if (filp->f_pos == 2 || (last &&
115 filp->f_pos < ceph_dentry(last)->offset)) {
116 if (list_empty(&parent->d_subdirs))
117 goto out_unlock;
 /* the walk begins at the tail of d_subdirs and follows ->prev */
118 p = parent->d_subdirs.prev;
119 dout(" initial p %p/%p\n", p->prev, p->next);
120 } else {
 /*
  * resume at the entry before (->prev of) the last one returned.
  * NOTE(review): this dereferences 'last' unconditionally; it relies
  * on the caller only getting here with f_pos == 2 or fi->dentry set.
  */
121 p = last->d_u.d_child.prev;
122 }
123
124more:
125 dentry = list_entry(p, struct dentry, d_u.d_child);
126 di = ceph_dentry(dentry);
 /* skip entries until one is hashed, positive, visible and at/after f_pos */
127 while (1) {
128 dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
129 parent->d_subdirs.prev, parent->d_subdirs.next);
 /* back at the list head: every child has been visited */
130 if (p == &parent->d_subdirs) {
131 fi->at_end = 1;
132 goto out_unlock;
133 }
134 if (!d_unhashed(dentry) && dentry->d_inode &&
135 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
136 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
137 filp->f_pos <= di->offset)
138 break;
139 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
140 dentry->d_name.len, dentry->d_name.name, di->offset,
141 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
142 !dentry->d_inode ? " null" : "");
143 p = p->prev;
144 dentry = list_entry(p, struct dentry, d_u.d_child);
145 di = ceph_dentry(dentry);
146 }
147
 /* take a reference on the dentry before the locks are dropped for filldir */
148 atomic_inc(&dentry->d_count);
149 spin_unlock(&dcache_lock);
150 spin_unlock(&inode->i_lock);
151
152 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
153 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
154 filp->f_pos = di->offset;
155 err = filldir(dirent, dentry->d_name.name,
156 dentry->d_name.len, di->offset,
157 dentry->d_inode->i_ino,
158 dentry->d_inode->i_mode >> 12);
159
 /*
  * If filldir refused the entry, hand the previous dentry reference
  * back to fi so a later call can resume from it; otherwise drop it.
  */
160 if (last) {
161 if (err < 0) {
162 /* remember our position */
163 fi->dentry = last;
164 fi->next_offset = di->offset;
165 } else {
166 dput(last);
167 }
168 last = NULL;
169 }
170
171 spin_lock(&inode->i_lock);
172 spin_lock(&dcache_lock);
173
 /* from here on 'last' carries the reference taken on 'dentry' above */
174 last = dentry;
175
176 if (err < 0)
177 goto out_unlock;
178
179 p = p->prev;
180 filp->f_pos++;
181
182 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
183 if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
184 goto more;
185 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
186 err = -EAGAIN;
187
188out_unlock:
189 spin_unlock(&dcache_lock);
190
 /* i_lock is released around dput() and retaken before returning */
191 if (last) {
192 spin_unlock(&inode->i_lock);
193 dput(last);
194 spin_lock(&inode->i_lock);
195 }
196
197 return err;
198}
199
200/*
201 * make note of the last dentry we read, so we can
202 * continue at the same lexicographical point,
203 * regardless of what dir changes take place on the
204 * server.
205 */
206static int note_last_dentry(struct ceph_file_info *fi, const char *name,
207 int len)
208{
209 kfree(fi->last_name);
210 fi->last_name = kmalloc(len+1, GFP_NOFS);
211 if (!fi->last_name)
212 return -ENOMEM;
213 memcpy(fi->last_name, name, len);
214 fi->last_name[len] = 0;
215 dout("note_last_dentry '%s'\n", fi->last_name);
216 return 0;
217}
218
219static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
220{
221 struct ceph_file_info *fi = filp->private_data;
222 struct inode *inode = filp->f_dentry->d_inode;
223 struct ceph_inode_info *ci = ceph_inode(inode);
224 struct ceph_client *client = ceph_inode_to_client(inode);
225 struct ceph_mds_client *mdsc = &client->mdsc;
226 unsigned frag = fpos_frag(filp->f_pos);
227 int off = fpos_off(filp->f_pos);
228 int err;
229 u32 ftype;
230 struct ceph_mds_reply_info_parsed *rinfo;
231 const int max_entries = client->mount_args->max_readdir;
232
233 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
234 if (fi->at_end)
235 return 0;
236
237 /* always start with . and .. */
238 if (filp->f_pos == 0) {
239 /* note dir version at start of readdir so we can tell
240 * if any dentries get dropped */
241 fi->dir_release_count = ci->i_release_count;
242
243 dout("readdir off 0 -> '.'\n");
244 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
245 inode->i_ino, inode->i_mode >> 12) < 0)
246 return 0;
247 filp->f_pos = 1;
248 off = 1;
249 }
250 if (filp->f_pos == 1) {
251 dout("readdir off 1 -> '..'\n");
252 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
253 filp->f_dentry->d_parent->d_inode->i_ino,
254 inode->i_mode >> 12) < 0)
255 return 0;
256 filp->f_pos = 2;
257 off = 2;
258 }
259
260 /* can we use the dcache? */
261 spin_lock(&inode->i_lock);
262 if ((filp->f_pos == 2 || fi->dentry) &&
263 !ceph_test_opt(client, NOASYNCREADDIR) &&
264 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
265 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
266 err = __dcache_readdir(filp, dirent, filldir);
267 if (err != -EAGAIN) {
268 spin_unlock(&inode->i_lock);
269 return err;
270 }
271 }
272 spin_unlock(&inode->i_lock);
273 if (fi->dentry) {
274 err = note_last_dentry(fi, fi->dentry->d_name.name,
275 fi->dentry->d_name.len);
276 if (err)
277 return err;
278 dput(fi->dentry);
279 fi->dentry = NULL;
280 }
281
282 /* proceed with a normal readdir */
283
284more:
285 /* do we have the correct frag content buffered? */
286 if (fi->frag != frag || fi->last_readdir == NULL) {
287 struct ceph_mds_request *req;
288 int op = ceph_snap(inode) == CEPH_SNAPDIR ?
289 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
290
291 /* discard old result, if any */
292 if (fi->last_readdir) {
293 ceph_mdsc_put_request(fi->last_readdir);
294 fi->last_readdir = NULL;
295 }
296
297 /* requery frag tree, as the frag topology may have changed */
298 frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
299
300 dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
301 ceph_vinop(inode), frag, fi->last_name);
302 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
303 if (IS_ERR(req))
304 return PTR_ERR(req);
305 req->r_inode = igrab(inode);
306 req->r_dentry = dget(filp->f_dentry);
307 /* hints to request -> mds selection code */
308 req->r_direct_mode = USE_AUTH_MDS;
309 req->r_direct_hash = ceph_frag_value(frag);
310 req->r_direct_is_hash = true;
311 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
312 req->r_readdir_offset = fi->next_offset;
313 req->r_args.readdir.frag = cpu_to_le32(frag);
314 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
315 req->r_num_caps = max_entries + 1;
316 err = ceph_mdsc_do_request(mdsc, NULL, req);
317 if (err < 0) {
318 ceph_mdsc_put_request(req);
319 return err;
320 }
321 dout("readdir got and parsed readdir result=%d"
322 " on frag %x, end=%d, complete=%d\n", err, frag,
323 (int)req->r_reply_info.dir_end,
324 (int)req->r_reply_info.dir_complete);
325
326 if (!req->r_did_prepopulate) {
327 dout("readdir !did_prepopulate");
328 fi->dir_release_count--; /* preclude I_COMPLETE */
329 }
330
331 /* note next offset and last dentry name */
332 fi->offset = fi->next_offset;
333 fi->last_readdir = req;
334
335 if (req->r_reply_info.dir_end) {
336 kfree(fi->last_name);
337 fi->last_name = NULL;
338 fi->next_offset = 0;
339 } else {
340 rinfo = &req->r_reply_info;
341 err = note_last_dentry(fi,
342 rinfo->dir_dname[rinfo->dir_nr-1],
343 rinfo->dir_dname_len[rinfo->dir_nr-1]);
344 if (err)
345 return err;
346 fi->next_offset += rinfo->dir_nr;
347 }
348 }
349
350 rinfo = &fi->last_readdir->r_reply_info;
351 dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
352 rinfo->dir_nr, off, fi->offset);
353 while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
354 u64 pos = ceph_make_fpos(frag, off);
355 struct ceph_mds_reply_inode *in =
356 rinfo->dir_in[off - fi->offset].in;
357 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
358 off, off - fi->offset, rinfo->dir_nr, pos,
359 rinfo->dir_dname_len[off - fi->offset],
360 rinfo->dir_dname[off - fi->offset], in);
361 BUG_ON(!in);
362 ftype = le32_to_cpu(in->mode) >> 12;
363 if (filldir(dirent,
364 rinfo->dir_dname[off - fi->offset],
365 rinfo->dir_dname_len[off - fi->offset],
366 pos,
367 le64_to_cpu(in->ino),
368 ftype) < 0) {
369 dout("filldir stopping us...\n");
370 return 0;
371 }
372 off++;
373 filp->f_pos = pos + 1;
374 }
375
376 if (fi->last_name) {
377 ceph_mdsc_put_request(fi->last_readdir);
378 fi->last_readdir = NULL;
379 goto more;
380 }
381
382 /* more frags? */
383 if (!ceph_frag_is_rightmost(frag)) {
384 frag = ceph_frag_next(frag);
385 off = 0;
386 filp->f_pos = ceph_make_fpos(frag, off);
387 dout("readdir next frag is %x\n", frag);
388 goto more;
389 }
390 fi->at_end = 1;
391
392 /*
393 * if dir_release_count still matches the dir, no dentries
394 * were released during the whole readdir, and we should have
395 * the complete dir contents in our cache.
396 */
397 spin_lock(&inode->i_lock);
398 if (ci->i_release_count == fi->dir_release_count) {
399 dout(" marking %p complete\n", inode);
400 ci->i_ceph_flags |= CEPH_I_COMPLETE;
401 ci->i_max_offset = filp->f_pos;
402 }
403 spin_unlock(&inode->i_lock);
404
405 dout("readdir %p filp %p done.\n", inode, filp);
406 return 0;
407}
408
409static void reset_readdir(struct ceph_file_info *fi)
410{
411 if (fi->last_readdir) {
412 ceph_mdsc_put_request(fi->last_readdir);
413 fi->last_readdir = NULL;
414 }
415 kfree(fi->last_name);
416 fi->next_offset = 2; /* compensate for . and .. */
417 if (fi->dentry) {
418 dput(fi->dentry);
419 fi->dentry = NULL;
420 }
421 fi->at_end = 0;
422}
423
424static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
425{
426 struct ceph_file_info *fi = file->private_data;
427 struct inode *inode = file->f_mapping->host;
428 loff_t old_offset = offset;
429 loff_t retval;
430
431 mutex_lock(&inode->i_mutex);
432 switch (origin) {
433 case SEEK_END:
434 offset += inode->i_size + 2; /* FIXME */
435 break;
436 case SEEK_CUR:
437 offset += file->f_pos;
438 }
439 retval = -EINVAL;
440 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
441 if (offset != file->f_pos) {
442 file->f_pos = offset;
443 file->f_version = 0;
444 fi->at_end = 0;
445 }
446 retval = offset;
447
448 /*
449 * discard buffered readdir content on seekdir(0), or
450 * seek to new frag, or seek prior to current chunk.
451 */
452 if (offset == 0 ||
453 fpos_frag(offset) != fpos_frag(old_offset) ||
454 fpos_off(offset) < fi->offset) {
455 dout("dir_llseek dropping %p content\n", file);
456 reset_readdir(fi);
457 }
458
459 /* bump dir_release_count if we did a forward seek */
460 if (offset > old_offset)
461 fi->dir_release_count--;
462 }
463 mutex_unlock(&inode->i_mutex);
464 return retval;
465}
466
467/*
468 * Process result of a lookup/open request.
469 *
470 * Mainly, make sure we return the final req->r_dentry (if it already
471 * existed) in place of the original VFS-provided dentry when they
472 * differ.
473 *
474 * Gracefully handle the case where the MDS replies with -ENOENT and
475 * no trace (which it may do, at its discretion, e.g., if it doesn't
476 * care to issue a lease on the negative dentry).
477 */
478struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
479 struct dentry *dentry, int err)
480{
481 struct ceph_client *client = ceph_client(dentry->d_sb);
482 struct inode *parent = dentry->d_parent->d_inode;
483
484 /* .snap dir? */
485 if (err == -ENOENT &&
486 ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
487 strcmp(dentry->d_name.name,
488 client->mount_args->snapdir_name) == 0) {
489 struct inode *inode = ceph_get_snapdir(parent);
490 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
491 dentry, dentry->d_name.len, dentry->d_name.name, inode);
492 BUG_ON(!d_unhashed(dentry));
493 d_add(dentry, inode);
494 err = 0;
495 }
496
497 if (err == -ENOENT) {
498 /* no trace? */
499 err = 0;
500 if (!req->r_reply_info.head->is_dentry) {
501 dout("ENOENT and no trace, dentry %p inode %p\n",
502 dentry, dentry->d_inode);
503 if (dentry->d_inode) {
504 d_drop(dentry);
505 err = -ENOENT;
506 } else {
507 d_add(dentry, NULL);
508 }
509 }
510 }
511 if (err)
512 dentry = ERR_PTR(err);
513 else if (dentry != req->r_dentry)
514 dentry = dget(req->r_dentry); /* we got spliced */
515 else
516 dentry = NULL;
517 return dentry;
518}
519
520static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
521{
522 return ceph_ino(inode) == CEPH_INO_ROOT &&
523 strncmp(dentry->d_name.name, ".ceph", 5) == 0;
524}
525
526/*
527 * Look up a single dir entry. If there is a lookup intent, inform
528 * the MDS so that it gets our 'caps wanted' value in a single op.
529 */
530static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
531 struct nameidata *nd)
532{
533 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
534 struct ceph_mds_client *mdsc = &client->mdsc;
535 struct ceph_mds_request *req;
536 int op;
537 int err;
538
539 dout("lookup %p dentry %p '%.*s'\n",
540 dir, dentry, dentry->d_name.len, dentry->d_name.name);
541
542 if (dentry->d_name.len > NAME_MAX)
543 return ERR_PTR(-ENAMETOOLONG);
544
545 err = ceph_init_dentry(dentry);
546 if (err < 0)
547 return ERR_PTR(err);
548
549 /* open (but not create!) intent? */
550 if (nd &&
551 (nd->flags & LOOKUP_OPEN) &&
552 (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */
553 !(nd->intent.open.flags & O_CREAT)) {
554 int mode = nd->intent.open.create_mode & ~current->fs->umask;
555 return ceph_lookup_open(dir, dentry, nd, mode, 1);
556 }
557
558 /* can we conclude ENOENT locally? */
559 if (dentry->d_inode == NULL) {
560 struct ceph_inode_info *ci = ceph_inode(dir);
561 struct ceph_dentry_info *di = ceph_dentry(dentry);
562
563 spin_lock(&dir->i_lock);
564 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
565 if (strncmp(dentry->d_name.name,
566 client->mount_args->snapdir_name,
567 dentry->d_name.len) &&
568 !is_root_ceph_dentry(dir, dentry) &&
569 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
570 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
571 di->offset = ci->i_max_offset++;
572 spin_unlock(&dir->i_lock);
573 dout(" dir %p complete, -ENOENT\n", dir);
574 d_add(dentry, NULL);
575 di->lease_shared_gen = ci->i_shared_gen;
576 return NULL;
577 }
578 spin_unlock(&dir->i_lock);
579 }
580
581 op = ceph_snap(dir) == CEPH_SNAPDIR ?
582 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
583 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
584 if (IS_ERR(req))
585 return ERR_PTR(PTR_ERR(req));
586 req->r_dentry = dget(dentry);
587 req->r_num_caps = 2;
588 /* we only need inode linkage */
589 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
590 req->r_locked_dir = dir;
591 err = ceph_mdsc_do_request(mdsc, NULL, req);
592 dentry = ceph_finish_lookup(req, dentry, err);
593 ceph_mdsc_put_request(req); /* will dput(dentry) */
594 dout("lookup result=%p\n", dentry);
595 return dentry;
596}
597
598/*
599 * If we do a create but get no trace back from the MDS, follow up with
600 * a lookup (the VFS expects us to link up the provided dentry).
601 */
602int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
603{
604 struct dentry *result = ceph_lookup(dir, dentry, NULL);
605
606 if (result && !IS_ERR(result)) {
607 /*
608 * We created the item, then did a lookup, and found
609 * it was already linked to another inode we already
610 * had in our cache (and thus got spliced). Link our
611 * dentry to that inode, but don't hash it, just in
612 * case the VFS wants to dereference it.
613 */
614 BUG_ON(!result->d_inode);
615 d_instantiate(dentry, result->d_inode);
616 return 0;
617 }
618 return PTR_ERR(result);
619}
620
621static int ceph_mknod(struct inode *dir, struct dentry *dentry,
622 int mode, dev_t rdev)
623{
624 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
625 struct ceph_mds_client *mdsc = &client->mdsc;
626 struct ceph_mds_request *req;
627 int err;
628
629 if (ceph_snap(dir) != CEPH_NOSNAP)
630 return -EROFS;
631
632 dout("mknod in dir %p dentry %p mode 0%o rdev %d\n",
633 dir, dentry, mode, rdev);
634 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
635 if (IS_ERR(req)) {
636 d_drop(dentry);
637 return PTR_ERR(req);
638 }
639 req->r_dentry = dget(dentry);
640 req->r_num_caps = 2;
641 req->r_locked_dir = dir;
642 req->r_args.mknod.mode = cpu_to_le32(mode);
643 req->r_args.mknod.rdev = cpu_to_le32(rdev);
644 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
645 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
646 err = ceph_mdsc_do_request(mdsc, dir, req);
647 if (!err && !req->r_reply_info.head->is_dentry)
648 err = ceph_handle_notrace_create(dir, dentry);
649 ceph_mdsc_put_request(req);
650 if (err)
651 d_drop(dentry);
652 return err;
653}
654
655static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
656 struct nameidata *nd)
657{
658 dout("create in dir %p dentry %p name '%.*s'\n",
659 dir, dentry, dentry->d_name.len, dentry->d_name.name);
660
661 if (ceph_snap(dir) != CEPH_NOSNAP)
662 return -EROFS;
663
664 if (nd) {
665 BUG_ON((nd->flags & LOOKUP_OPEN) == 0);
666 dentry = ceph_lookup_open(dir, dentry, nd, mode, 0);
667 /* hrm, what should i do here if we get aliased? */
668 if (IS_ERR(dentry))
669 return PTR_ERR(dentry);
670 return 0;
671 }
672
673 /* fall back to mknod */
674 return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0);
675}
676
677static int ceph_symlink(struct inode *dir, struct dentry *dentry,
678 const char *dest)
679{
680 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
681 struct ceph_mds_client *mdsc = &client->mdsc;
682 struct ceph_mds_request *req;
683 int err;
684
685 if (ceph_snap(dir) != CEPH_NOSNAP)
686 return -EROFS;
687
688 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
689 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
690 if (IS_ERR(req)) {
691 d_drop(dentry);
692 return PTR_ERR(req);
693 }
694 req->r_dentry = dget(dentry);
695 req->r_num_caps = 2;
696 req->r_path2 = kstrdup(dest, GFP_NOFS);
697 req->r_locked_dir = dir;
698 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
699 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
700 err = ceph_mdsc_do_request(mdsc, dir, req);
701 if (!err && !req->r_reply_info.head->is_dentry)
702 err = ceph_handle_notrace_create(dir, dentry);
703 ceph_mdsc_put_request(req);
704 if (err)
705 d_drop(dentry);
706 return err;
707}
708
709static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
710{
711 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
712 struct ceph_mds_client *mdsc = &client->mdsc;
713 struct ceph_mds_request *req;
714 int err = -EROFS;
715 int op;
716
717 if (ceph_snap(dir) == CEPH_SNAPDIR) {
718 /* mkdir .snap/foo is a MKSNAP */
719 op = CEPH_MDS_OP_MKSNAP;
720 dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
721 dentry->d_name.len, dentry->d_name.name, dentry);
722 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
723 dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode);
724 op = CEPH_MDS_OP_MKDIR;
725 } else {
726 goto out;
727 }
728 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
729 if (IS_ERR(req)) {
730 err = PTR_ERR(req);
731 goto out;
732 }
733
734 req->r_dentry = dget(dentry);
735 req->r_num_caps = 2;
736 req->r_locked_dir = dir;
737 req->r_args.mkdir.mode = cpu_to_le32(mode);
738 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
739 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
740 err = ceph_mdsc_do_request(mdsc, dir, req);
741 if (!err && !req->r_reply_info.head->is_dentry)
742 err = ceph_handle_notrace_create(dir, dentry);
743 ceph_mdsc_put_request(req);
744out:
745 if (err < 0)
746 d_drop(dentry);
747 return err;
748}
749
750static int ceph_link(struct dentry *old_dentry, struct inode *dir,
751 struct dentry *dentry)
752{
753 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
754 struct ceph_mds_client *mdsc = &client->mdsc;
755 struct ceph_mds_request *req;
756 int err;
757
758 if (ceph_snap(dir) != CEPH_NOSNAP)
759 return -EROFS;
760
761 dout("link in dir %p old_dentry %p dentry %p\n", dir,
762 old_dentry, dentry);
763 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
764 if (IS_ERR(req)) {
765 d_drop(dentry);
766 return PTR_ERR(req);
767 }
768 req->r_dentry = dget(dentry);
769 req->r_num_caps = 2;
770 req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
771 req->r_locked_dir = dir;
772 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
773 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
774 err = ceph_mdsc_do_request(mdsc, dir, req);
775 if (err)
776 d_drop(dentry);
777 else if (!req->r_reply_info.head->is_dentry)
778 d_instantiate(dentry, igrab(old_dentry->d_inode));
779 ceph_mdsc_put_request(req);
780 return err;
781}
782
783/*
784 * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
785 * looks like the link count will hit 0, drop any other caps (other
786 * than PIN) we don't specifically want (due to the file still being
787 * open).
788 */
789static int drop_caps_for_unlink(struct inode *inode)
790{
791 struct ceph_inode_info *ci = ceph_inode(inode);
792 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
793
794 spin_lock(&inode->i_lock);
795 if (inode->i_nlink == 1) {
796 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
797 ci->i_ceph_flags |= CEPH_I_NODELAY;
798 }
799 spin_unlock(&inode->i_lock);
800 return drop;
801}
802
803/*
804 * rmdir and unlink are differ only by the metadata op code
805 */
806static int ceph_unlink(struct inode *dir, struct dentry *dentry)
807{
808 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
809 struct ceph_mds_client *mdsc = &client->mdsc;
810 struct inode *inode = dentry->d_inode;
811 struct ceph_mds_request *req;
812 int err = -EROFS;
813 int op;
814
815 if (ceph_snap(dir) == CEPH_SNAPDIR) {
816 /* rmdir .snap/foo is RMSNAP */
817 dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
818 dentry->d_name.name, dentry);
819 op = CEPH_MDS_OP_RMSNAP;
820 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
821 dout("unlink/rmdir dir %p dn %p inode %p\n",
822 dir, dentry, inode);
823 op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ?
824 CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
825 } else
826 goto out;
827 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
828 if (IS_ERR(req)) {
829 err = PTR_ERR(req);
830 goto out;
831 }
832 req->r_dentry = dget(dentry);
833 req->r_num_caps = 2;
834 req->r_locked_dir = dir;
835 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
836 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
837 req->r_inode_drop = drop_caps_for_unlink(inode);
838 err = ceph_mdsc_do_request(mdsc, dir, req);
839 if (!err && !req->r_reply_info.head->is_dentry)
840 d_delete(dentry);
841 ceph_mdsc_put_request(req);
842out:
843 return err;
844}
845
846static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
847 struct inode *new_dir, struct dentry *new_dentry)
848{
849 struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
850 struct ceph_mds_client *mdsc = &client->mdsc;
851 struct ceph_mds_request *req;
852 int err;
853
854 if (ceph_snap(old_dir) != ceph_snap(new_dir))
855 return -EXDEV;
856 if (ceph_snap(old_dir) != CEPH_NOSNAP ||
857 ceph_snap(new_dir) != CEPH_NOSNAP)
858 return -EROFS;
859 dout("rename dir %p dentry %p to dir %p dentry %p\n",
860 old_dir, old_dentry, new_dir, new_dentry);
861 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
862 if (IS_ERR(req))
863 return PTR_ERR(req);
864 req->r_dentry = dget(new_dentry);
865 req->r_num_caps = 2;
866 req->r_old_dentry = dget(old_dentry);
867 req->r_locked_dir = new_dir;
868 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
869 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
870 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
871 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
872 /* release LINK_RDCACHE on source inode (mds will lock it) */
873 req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
874 if (new_dentry->d_inode)
875 req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
876 err = ceph_mdsc_do_request(mdsc, old_dir, req);
877 if (!err && !req->r_reply_info.head->is_dentry) {
878 /*
879 * Normally d_move() is done by fill_trace (called by
880 * do_request, above). If there is no trace, we need
881 * to do it here.
882 */
883 d_move(old_dentry, new_dentry);
884 }
885 ceph_mdsc_put_request(req);
886 return err;
887}
888
889
890/*
891 * Check if dentry lease is valid. If not, delete the lease. Try to
892 * renew if the least is more than half up.
893 */
894static int dentry_lease_is_valid(struct dentry *dentry)
895{
896 struct ceph_dentry_info *di;
897 struct ceph_mds_session *s;
898 int valid = 0;
899 u32 gen;
900 unsigned long ttl;
901 struct ceph_mds_session *session = NULL;
902 struct inode *dir = NULL;
903 u32 seq = 0;
904
905 spin_lock(&dentry->d_lock);
906 di = ceph_dentry(dentry);
907 if (di && di->lease_session) {
908 s = di->lease_session;
909 spin_lock(&s->s_cap_lock);
910 gen = s->s_cap_gen;
911 ttl = s->s_cap_ttl;
912 spin_unlock(&s->s_cap_lock);
913
914 if (di->lease_gen == gen &&
915 time_before(jiffies, dentry->d_time) &&
916 time_before(jiffies, ttl)) {
917 valid = 1;
918 if (di->lease_renew_after &&
919 time_after(jiffies, di->lease_renew_after)) {
920 /* we should renew */
921 dir = dentry->d_parent->d_inode;
922 session = ceph_get_mds_session(s);
923 seq = di->lease_seq;
924 di->lease_renew_after = 0;
925 di->lease_renew_from = jiffies;
926 }
927 }
928 }
929 spin_unlock(&dentry->d_lock);
930
931 if (session) {
932 ceph_mdsc_lease_send_msg(session, dir, dentry,
933 CEPH_MDS_LEASE_RENEW, seq);
934 ceph_put_mds_session(session);
935 }
936 dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
937 return valid;
938}
939
940/*
941 * Check if directory-wide content lease/cap is valid.
942 */
943static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
944{
945 struct ceph_inode_info *ci = ceph_inode(dir);
946 struct ceph_dentry_info *di = ceph_dentry(dentry);
947 int valid = 0;
948
949 spin_lock(&dir->i_lock);
950 if (ci->i_shared_gen == di->lease_shared_gen)
951 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
952 spin_unlock(&dir->i_lock);
953 dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
954 dir, (unsigned)ci->i_shared_gen, dentry,
955 (unsigned)di->lease_shared_gen, valid);
956 return valid;
957}
958
959/*
960 * Check if cached dentry can be trusted.
961 */
962static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
963{
964 struct inode *dir = dentry->d_parent->d_inode;
965
966 dout("d_revalidate %p '%.*s' inode %p\n", dentry,
967 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
968
969 /* always trust cached snapped dentries, snapdir dentry */
970 if (ceph_snap(dir) != CEPH_NOSNAP) {
971 dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
972 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
973 goto out_touch;
974 }
975 if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR)
976 goto out_touch;
977
978 if (dentry_lease_is_valid(dentry) ||
979 dir_lease_is_valid(dir, dentry))
980 goto out_touch;
981
982 dout("d_revalidate %p invalid\n", dentry);
983 d_drop(dentry);
984 return 0;
985out_touch:
986 ceph_dentry_lru_touch(dentry);
987 return 1;
988}
989
990/*
991 * When a dentry is released, clear the dir I_COMPLETE if it was part
992 * of the current dir gen.
993 */
994static void ceph_dentry_release(struct dentry *dentry)
995{
996 struct ceph_dentry_info *di = ceph_dentry(dentry);
997 struct inode *parent_inode = dentry->d_parent->d_inode;
998
999 if (parent_inode) {
1000 struct ceph_inode_info *ci = ceph_inode(parent_inode);
1001
1002 spin_lock(&parent_inode->i_lock);
1003 if (ci->i_shared_gen == di->lease_shared_gen) {
1004 dout(" clearing %p complete (d_release)\n",
1005 parent_inode);
1006 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1007 ci->i_release_count++;
1008 }
1009 spin_unlock(&parent_inode->i_lock);
1010 }
1011 if (di) {
1012 ceph_dentry_lru_del(dentry);
1013 if (di->lease_session)
1014 ceph_put_mds_session(di->lease_session);
1015 kmem_cache_free(ceph_dentry_cachep, di);
1016 dentry->d_fsdata = NULL;
1017 }
1018}
1019
/*
 * Revalidation for dentries under the snapdir: always reports the
 * dentry as valid.
 */
static int ceph_snapdir_d_revalidate(struct dentry *dentry,
				     struct nameidata *nd)
{
	/*
	 * Eventually, we'll want to revalidate snapped metadata
	 * too... probably...
	 */
	return 1;
}
1029
1030
1031
1032/*
1033 * read() on a dir. This weird interface hack only works if mounted
1034 * with '-o dirstat'.
1035 */
1036static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1037 loff_t *ppos)
1038{
1039 struct ceph_file_info *cf = file->private_data;
1040 struct inode *inode = file->f_dentry->d_inode;
1041 struct ceph_inode_info *ci = ceph_inode(inode);
1042 int left;
1043
1044 if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
1045 return -EISDIR;
1046
1047 if (!cf->dir_info) {
1048 cf->dir_info = kmalloc(1024, GFP_NOFS);
1049 if (!cf->dir_info)
1050 return -ENOMEM;
1051 cf->dir_info_len =
1052 sprintf(cf->dir_info,
1053 "entries: %20lld\n"
1054 " files: %20lld\n"
1055 " subdirs: %20lld\n"
1056 "rentries: %20lld\n"
1057 " rfiles: %20lld\n"
1058 " rsubdirs: %20lld\n"
1059 "rbytes: %20lld\n"
1060 "rctime: %10ld.%09ld\n",
1061 ci->i_files + ci->i_subdirs,
1062 ci->i_files,
1063 ci->i_subdirs,
1064 ci->i_rfiles + ci->i_rsubdirs,
1065 ci->i_rfiles,
1066 ci->i_rsubdirs,
1067 ci->i_rbytes,
1068 (long)ci->i_rctime.tv_sec,
1069 (long)ci->i_rctime.tv_nsec);
1070 }
1071
1072 if (*ppos >= cf->dir_info_len)
1073 return 0;
1074 size = min_t(unsigned, size, cf->dir_info_len-*ppos);
1075 left = copy_to_user(buf, cf->dir_info + *ppos, size);
1076 if (left == size)
1077 return -EFAULT;
1078 *ppos += (size - left);
1079 return size - left;
1080}
1081
1082/*
1083 * an fsync() on a dir will wait for any uncommitted directory
1084 * operations to commit.
1085 */
1086static int ceph_dir_fsync(struct file *file, struct dentry *dentry,
1087 int datasync)
1088{
1089 struct inode *inode = dentry->d_inode;
1090 struct ceph_inode_info *ci = ceph_inode(inode);
1091 struct list_head *head = &ci->i_unsafe_dirops;
1092 struct ceph_mds_request *req;
1093 u64 last_tid;
1094 int ret = 0;
1095
1096 dout("dir_fsync %p\n", inode);
1097 spin_lock(&ci->i_unsafe_lock);
1098 if (list_empty(head))
1099 goto out;
1100
1101 req = list_entry(head->prev,
1102 struct ceph_mds_request, r_unsafe_dir_item);
1103 last_tid = req->r_tid;
1104
1105 do {
1106 ceph_mdsc_get_request(req);
1107 spin_unlock(&ci->i_unsafe_lock);
1108 dout("dir_fsync %p wait on tid %llu (until %llu)\n",
1109 inode, req->r_tid, last_tid);
1110 if (req->r_timeout) {
1111 ret = wait_for_completion_timeout(
1112 &req->r_safe_completion, req->r_timeout);
1113 if (ret > 0)
1114 ret = 0;
1115 else if (ret == 0)
1116 ret = -EIO; /* timed out */
1117 } else {
1118 wait_for_completion(&req->r_safe_completion);
1119 }
1120 spin_lock(&ci->i_unsafe_lock);
1121 ceph_mdsc_put_request(req);
1122
1123 if (ret || list_empty(head))
1124 break;
1125 req = list_entry(head->next,
1126 struct ceph_mds_request, r_unsafe_dir_item);
1127 } while (req->r_tid < last_tid);
1128out:
1129 spin_unlock(&ci->i_unsafe_lock);
1130 return ret;
1131}
1132
1133/*
1134 * We maintain a private dentry LRU.
1135 *
1136 * FIXME: this needs to be changed to a per-mds lru to be useful.
1137 */
1138void ceph_dentry_lru_add(struct dentry *dn)
1139{
1140 struct ceph_dentry_info *di = ceph_dentry(dn);
1141 struct ceph_mds_client *mdsc;
1142
1143 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1144 dn->d_name.len, dn->d_name.name);
1145 if (di) {
1146 mdsc = &ceph_client(dn->d_sb)->mdsc;
1147 spin_lock(&mdsc->dentry_lru_lock);
1148 list_add_tail(&di->lru, &mdsc->dentry_lru);
1149 mdsc->num_dentry++;
1150 spin_unlock(&mdsc->dentry_lru_lock);
1151 }
1152}
1153
1154void ceph_dentry_lru_touch(struct dentry *dn)
1155{
1156 struct ceph_dentry_info *di = ceph_dentry(dn);
1157 struct ceph_mds_client *mdsc;
1158
1159 dout("dentry_lru_touch %p %p '%.*s'\n", di, dn,
1160 dn->d_name.len, dn->d_name.name);
1161 if (di) {
1162 mdsc = &ceph_client(dn->d_sb)->mdsc;
1163 spin_lock(&mdsc->dentry_lru_lock);
1164 list_move_tail(&di->lru, &mdsc->dentry_lru);
1165 spin_unlock(&mdsc->dentry_lru_lock);
1166 }
1167}
1168
1169void ceph_dentry_lru_del(struct dentry *dn)
1170{
1171 struct ceph_dentry_info *di = ceph_dentry(dn);
1172 struct ceph_mds_client *mdsc;
1173
1174 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1175 dn->d_name.len, dn->d_name.name);
1176 if (di) {
1177 mdsc = &ceph_client(dn->d_sb)->mdsc;
1178 spin_lock(&mdsc->dentry_lru_lock);
1179 list_del_init(&di->lru);
1180 mdsc->num_dentry--;
1181 spin_unlock(&mdsc->dentry_lru_lock);
1182 }
1183}
1184
1185const struct file_operations ceph_dir_fops = {
1186 .read = ceph_read_dir,
1187 .readdir = ceph_readdir,
1188 .llseek = ceph_dir_llseek,
1189 .open = ceph_open,
1190 .release = ceph_release,
1191 .unlocked_ioctl = ceph_ioctl,
1192 .fsync = ceph_dir_fsync,
1193};
1194
1195const struct inode_operations ceph_dir_iops = {
1196 .lookup = ceph_lookup,
1197 .permission = ceph_permission,
1198 .getattr = ceph_getattr,
1199 .setattr = ceph_setattr,
1200 .setxattr = ceph_setxattr,
1201 .getxattr = ceph_getxattr,
1202 .listxattr = ceph_listxattr,
1203 .removexattr = ceph_removexattr,
1204 .mknod = ceph_mknod,
1205 .symlink = ceph_symlink,
1206 .mkdir = ceph_mkdir,
1207 .link = ceph_link,
1208 .unlink = ceph_unlink,
1209 .rmdir = ceph_unlink,
1210 .rename = ceph_rename,
1211 .create = ceph_create,
1212};
1213
1214struct dentry_operations ceph_dentry_ops = {
1215 .d_revalidate = ceph_d_revalidate,
1216 .d_release = ceph_dentry_release,
1217};
1218
1219struct dentry_operations ceph_snapdir_dentry_ops = {
1220 .d_revalidate = ceph_snapdir_d_revalidate,
1221};
1222
1223struct dentry_operations ceph_snap_dentry_ops = {
1224};
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
new file mode 100644
index 000000000000..9d67572fb328
--- /dev/null
+++ b/fs/ceph/export.c
@@ -0,0 +1,224 @@
1#include "ceph_debug.h"
2
3#include <linux/exportfs.h>
4#include <linux/slab.h>
5#include <asm/unaligned.h>
6
7#include "super.h"
8
9/*
10 * NFS export support
11 *
12 * NFS re-export of a ceph mount is, at present, only semireliable.
13 * The basic issue is that the Ceph architecture doesn't lend itself
14 * well to generating filehandles that will remain valid forever.
15 *
16 * So, we do our best. If you're lucky, your inode will be in the
17 * client's cache. If it's not, and you have a connectable fh, then
18 * the MDS server may be able to find it for you. Otherwise, you get
19 * ESTALE.
20 *
21 * There are ways to make this more reliable, but in the non-connectable fh
22 * case, we won't ever work perfectly, and in the connectable case,
23 * some changes are needed on the MDS side to work better.
24 */
25
26/*
27 * Basic fh
28 */
29struct ceph_nfs_fh {
30 u64 ino;
31} __attribute__ ((packed));
32
33/*
34 * Larger 'connectable' fh that includes parent ino and name hash.
35 * Use this whenever possible, as it works more reliably.
36 */
37struct ceph_nfs_confh {
38 u64 ino, parent_ino;
39 u32 parent_name_hash;
40} __attribute__ ((packed));
41
42static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
43 int connectable)
44{
45 struct ceph_nfs_fh *fh = (void *)rawfh;
46 struct ceph_nfs_confh *cfh = (void *)rawfh;
47 struct dentry *parent = dentry->d_parent;
48 struct inode *inode = dentry->d_inode;
49 int type;
50
51 /* don't re-export snaps */
52 if (ceph_snap(inode) != CEPH_NOSNAP)
53 return -EINVAL;
54
55 if (*max_len >= sizeof(*cfh)) {
56 dout("encode_fh %p connectable\n", dentry);
57 cfh->ino = ceph_ino(dentry->d_inode);
58 cfh->parent_ino = ceph_ino(parent->d_inode);
59 cfh->parent_name_hash = parent->d_name.hash;
60 *max_len = sizeof(*cfh);
61 type = 2;
62 } else if (*max_len > sizeof(*fh)) {
63 if (connectable)
64 return -ENOSPC;
65 dout("encode_fh %p\n", dentry);
66 fh->ino = ceph_ino(dentry->d_inode);
67 *max_len = sizeof(*fh);
68 type = 1;
69 } else {
70 return -ENOSPC;
71 }
72 return type;
73}
74
75/*
76 * convert regular fh to dentry
77 *
78 * FIXME: we should try harder by querying the mds for the ino.
79 */
80static struct dentry *__fh_to_dentry(struct super_block *sb,
81 struct ceph_nfs_fh *fh)
82{
83 struct inode *inode;
84 struct dentry *dentry;
85 struct ceph_vino vino;
86 int err;
87
88 dout("__fh_to_dentry %llx\n", fh->ino);
89 vino.ino = fh->ino;
90 vino.snap = CEPH_NOSNAP;
91 inode = ceph_find_inode(sb, vino);
92 if (!inode)
93 return ERR_PTR(-ESTALE);
94
95 dentry = d_obtain_alias(inode);
96 if (!dentry) {
97 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
98 fh->ino, inode);
99 iput(inode);
100 return ERR_PTR(-ENOMEM);
101 }
102 err = ceph_init_dentry(dentry);
103
104 if (err < 0) {
105 iput(inode);
106 return ERR_PTR(err);
107 }
108 dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
109 return dentry;
110}
111
112/*
113 * convert connectable fh to dentry
114 */
115static struct dentry *__cfh_to_dentry(struct super_block *sb,
116 struct ceph_nfs_confh *cfh)
117{
118 struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc;
119 struct inode *inode;
120 struct dentry *dentry;
121 struct ceph_vino vino;
122 int err;
123
124 dout("__cfh_to_dentry %llx (%llx/%x)\n",
125 cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
126
127 vino.ino = cfh->ino;
128 vino.snap = CEPH_NOSNAP;
129 inode = ceph_find_inode(sb, vino);
130 if (!inode) {
131 struct ceph_mds_request *req;
132
133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
134 USE_ANY_MDS);
135 if (IS_ERR(req))
136 return ERR_PTR(PTR_ERR(req));
137
138 req->r_ino1 = vino;
139 req->r_ino2.ino = cfh->parent_ino;
140 req->r_ino2.snap = CEPH_NOSNAP;
141 req->r_path2 = kmalloc(16, GFP_NOFS);
142 snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
143 req->r_num_caps = 1;
144 err = ceph_mdsc_do_request(mdsc, NULL, req);
145 ceph_mdsc_put_request(req);
146 inode = ceph_find_inode(sb, vino);
147 if (!inode)
148 return ERR_PTR(err ? err : -ESTALE);
149 }
150
151 dentry = d_obtain_alias(inode);
152 if (!dentry) {
153 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
154 cfh->ino, inode);
155 iput(inode);
156 return ERR_PTR(-ENOMEM);
157 }
158 err = ceph_init_dentry(dentry);
159 if (err < 0) {
160 iput(inode);
161 return ERR_PTR(err);
162 }
163 dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
164 return dentry;
165}
166
167static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
168 int fh_len, int fh_type)
169{
170 if (fh_type == 1)
171 return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw);
172 else
173 return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw);
174}
175
176/*
177 * get parent, if possible.
178 *
179 * FIXME: we could do better by querying the mds to discover the
180 * parent.
181 */
182static struct dentry *ceph_fh_to_parent(struct super_block *sb,
183 struct fid *fid,
184 int fh_len, int fh_type)
185{
186 struct ceph_nfs_confh *cfh = (void *)fid->raw;
187 struct ceph_vino vino;
188 struct inode *inode;
189 struct dentry *dentry;
190 int err;
191
192 if (fh_type == 1)
193 return ERR_PTR(-ESTALE);
194
195 pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
196 cfh->parent_name_hash);
197
198 vino.ino = cfh->ino;
199 vino.snap = CEPH_NOSNAP;
200 inode = ceph_find_inode(sb, vino);
201 if (!inode)
202 return ERR_PTR(-ESTALE);
203
204 dentry = d_obtain_alias(inode);
205 if (!dentry) {
206 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
207 cfh->ino, inode);
208 iput(inode);
209 return ERR_PTR(-ENOMEM);
210 }
211 err = ceph_init_dentry(dentry);
212 if (err < 0) {
213 iput(inode);
214 return ERR_PTR(err);
215 }
216 dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry);
217 return dentry;
218}
219
220const struct export_operations ceph_export_ops = {
221 .encode_fh = ceph_encode_fh,
222 .fh_to_dentry = ceph_fh_to_dentry,
223 .fh_to_parent = ceph_fh_to_parent,
224};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
new file mode 100644
index 000000000000..4add3d5da2c1
--- /dev/null
+++ b/fs/ceph/file.c
@@ -0,0 +1,938 @@
1#include "ceph_debug.h"
2
3#include <linux/sched.h>
4#include <linux/slab.h>
5#include <linux/file.h>
6#include <linux/namei.h>
7#include <linux/writeback.h>
8
9#include "super.h"
10#include "mds_client.h"
11
12/*
13 * Ceph file operations
14 *
15 * Implement basic open/close functionality, and implement
16 * read/write.
17 *
18 * We implement three modes of file I/O:
19 * - buffered uses the generic_file_aio_{read,write} helpers
20 *
21 * - synchronous is used when there is multi-client read/write
22 * sharing, avoids the page cache, and synchronously waits for an
23 * ack from the OSD.
24 *
25 * - direct io takes the variant of the sync path that references
26 * user pages directly.
27 *
28 * fsync() flushes and waits on dirty pages, but just queues metadata
29 * for writeback: since the MDS can recover size and mtime there is no
30 * need to wait for MDS acknowledgement.
31 */
32
33
34/*
35 * Prepare an open request. Preallocate ceph_cap to avoid an
36 * inopportune ENOMEM later.
37 */
38static struct ceph_mds_request *
39prepare_open_request(struct super_block *sb, int flags, int create_mode)
40{
41 struct ceph_client *client = ceph_sb_to_client(sb);
42 struct ceph_mds_client *mdsc = &client->mdsc;
43 struct ceph_mds_request *req;
44 int want_auth = USE_ANY_MDS;
45 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
46
47 if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
48 want_auth = USE_AUTH_MDS;
49
50 req = ceph_mdsc_create_request(mdsc, op, want_auth);
51 if (IS_ERR(req))
52 goto out;
53 req->r_fmode = ceph_flags_to_mode(flags);
54 req->r_args.open.flags = cpu_to_le32(flags);
55 req->r_args.open.mode = cpu_to_le32(create_mode);
56 req->r_args.open.preferred = cpu_to_le32(-1);
57out:
58 return req;
59}
60
61/*
62 * initialize private struct file data.
63 * if we fail, clean up by dropping fmode reference on the ceph_inode
64 */
65static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
66{
67 struct ceph_file_info *cf;
68 int ret = 0;
69
70 switch (inode->i_mode & S_IFMT) {
71 case S_IFREG:
72 case S_IFDIR:
73 dout("init_file %p %p 0%o (regular)\n", inode, file,
74 inode->i_mode);
75 cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
76 if (cf == NULL) {
77 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
78 return -ENOMEM;
79 }
80 cf->fmode = fmode;
81 cf->next_offset = 2;
82 file->private_data = cf;
83 BUG_ON(inode->i_fop->release != ceph_release);
84 break;
85
86 case S_IFLNK:
87 dout("init_file %p %p 0%o (symlink)\n", inode, file,
88 inode->i_mode);
89 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
90 break;
91
92 default:
93 dout("init_file %p %p 0%o (special)\n", inode, file,
94 inode->i_mode);
95 /*
96 * we need to drop the open ref now, since we don't
97 * have .release set to ceph_release.
98 */
99 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
100 BUG_ON(inode->i_fop->release == ceph_release);
101
102 /* call the proper open fop */
103 ret = inode->i_fop->open(inode, file);
104 }
105 return ret;
106}
107
108/*
109 * If the filp already has private_data, that means the file was
110 * already opened by intent during lookup, and we do nothing.
111 *
112 * If we already have the requisite capabilities, we can satisfy
113 * the open request locally (no need to request new caps from the
114 * MDS). We do, however, need to inform the MDS (asynchronously)
115 * if our wanted caps set expands.
116 */
117int ceph_open(struct inode *inode, struct file *file)
118{
119 struct ceph_inode_info *ci = ceph_inode(inode);
120 struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
121 struct ceph_mds_client *mdsc = &client->mdsc;
122 struct ceph_mds_request *req;
123 struct ceph_file_info *cf = file->private_data;
124 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
125 int err;
126 int flags, fmode, wanted;
127
128 if (cf) {
129 dout("open file %p is already opened\n", file);
130 return 0;
131 }
132
133 /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */
134 flags = file->f_flags & ~(O_CREAT|O_EXCL);
135 if (S_ISDIR(inode->i_mode))
136 flags = O_DIRECTORY; /* mds likes to know */
137
138 dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
139 ceph_vinop(inode), file, flags, file->f_flags);
140 fmode = ceph_flags_to_mode(flags);
141 wanted = ceph_caps_for_mode(fmode);
142
143 /* snapped files are read-only */
144 if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
145 return -EROFS;
146
147 /* trivially open snapdir */
148 if (ceph_snap(inode) == CEPH_SNAPDIR) {
149 spin_lock(&inode->i_lock);
150 __ceph_get_fmode(ci, fmode);
151 spin_unlock(&inode->i_lock);
152 return ceph_init_file(inode, file, fmode);
153 }
154
155 /*
156 * No need to block if we have any caps. Update wanted set
157 * asynchronously.
158 */
159 spin_lock(&inode->i_lock);
160 if (__ceph_is_any_real_caps(ci)) {
161 int mds_wanted = __ceph_caps_mds_wanted(ci);
162 int issued = __ceph_caps_issued(ci, NULL);
163
164 dout("open %p fmode %d want %s issued %s using existing\n",
165 inode, fmode, ceph_cap_string(wanted),
166 ceph_cap_string(issued));
167 __ceph_get_fmode(ci, fmode);
168 spin_unlock(&inode->i_lock);
169
170 /* adjust wanted? */
171 if ((issued & wanted) != wanted &&
172 (mds_wanted & wanted) != wanted &&
173 ceph_snap(inode) != CEPH_SNAPDIR)
174 ceph_check_caps(ci, 0, NULL);
175
176 return ceph_init_file(inode, file, fmode);
177 } else if (ceph_snap(inode) != CEPH_NOSNAP &&
178 (ci->i_snap_caps & wanted) == wanted) {
179 __ceph_get_fmode(ci, fmode);
180 spin_unlock(&inode->i_lock);
181 return ceph_init_file(inode, file, fmode);
182 }
183 spin_unlock(&inode->i_lock);
184
185 dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
186 req = prepare_open_request(inode->i_sb, flags, 0);
187 if (IS_ERR(req)) {
188 err = PTR_ERR(req);
189 goto out;
190 }
191 req->r_inode = igrab(inode);
192 req->r_num_caps = 1;
193 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
194 if (!err)
195 err = ceph_init_file(inode, file, req->r_fmode);
196 ceph_mdsc_put_request(req);
197 dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
198out:
199 return err;
200}
201
202
203/*
204 * Do a lookup + open with a single request.
205 *
206 * If this succeeds, but some subsequent check in the vfs
207 * may_open() fails, the struct *file gets cleaned up (i.e.
208 * ceph_release gets called). So fear not!
209 */
210/*
211 * flags
212 * path_lookup_open -> LOOKUP_OPEN
213 * path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE
214 */
215struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
216 struct nameidata *nd, int mode,
217 int locked_dir)
218{
219 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
220 struct ceph_mds_client *mdsc = &client->mdsc;
221 struct file *file = nd->intent.open.file;
222 struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
223 struct ceph_mds_request *req;
224 int err;
225 int flags = nd->intent.open.flags - 1; /* silly vfs! */
226
227 dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n",
228 dentry, dentry->d_name.len, dentry->d_name.name, flags, mode);
229
230 /* do the open */
231 req = prepare_open_request(dir->i_sb, flags, mode);
232 if (IS_ERR(req))
233 return ERR_PTR(PTR_ERR(req));
234 req->r_dentry = dget(dentry);
235 req->r_num_caps = 2;
236 if (flags & O_CREAT) {
237 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
238 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
239 }
240 req->r_locked_dir = dir; /* caller holds dir->i_mutex */
241 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
242 dentry = ceph_finish_lookup(req, dentry, err);
243 if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
244 err = ceph_handle_notrace_create(dir, dentry);
245 if (!err)
246 err = ceph_init_file(req->r_dentry->d_inode, file,
247 req->r_fmode);
248 ceph_mdsc_put_request(req);
249 dout("ceph_lookup_open result=%p\n", dentry);
250 return dentry;
251}
252
253int ceph_release(struct inode *inode, struct file *file)
254{
255 struct ceph_inode_info *ci = ceph_inode(inode);
256 struct ceph_file_info *cf = file->private_data;
257
258 dout("release inode %p file %p\n", inode, file);
259 ceph_put_fmode(ci, cf->fmode);
260 if (cf->last_readdir)
261 ceph_mdsc_put_request(cf->last_readdir);
262 kfree(cf->last_name);
263 kfree(cf->dir_info);
264 dput(cf->dentry);
265 kmem_cache_free(ceph_file_cachep, cf);
266
267 /* wake up anyone waiting for caps on this inode */
268 wake_up(&ci->i_cap_wq);
269 return 0;
270}
271
272/*
273 * build a vector of user pages
274 */
275static struct page **get_direct_page_vector(const char __user *data,
276 int num_pages,
277 loff_t off, size_t len)
278{
279 struct page **pages;
280 int rc;
281
282 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
283 if (!pages)
284 return ERR_PTR(-ENOMEM);
285
286 down_read(&current->mm->mmap_sem);
287 rc = get_user_pages(current, current->mm, (unsigned long)data,
288 num_pages, 0, 0, pages, NULL);
289 up_read(&current->mm->mmap_sem);
290 if (rc < 0)
291 goto fail;
292 return pages;
293
294fail:
295 kfree(pages);
296 return ERR_PTR(rc);
297}
298
/*
 * Drop a reference on each of the first @num_pages pages in @pages and
 * free the vector itself.
 */
static void put_page_vector(struct page **pages, int num_pages)
{
	struct page **cur = pages;
	struct page **end = pages + num_pages;

	while (cur < end)
		put_page(*cur++);
	kfree(pages);
}
307
/*
 * Free each of the first @num_pages pages in @pages (order-0
 * allocations) and then the vector itself.
 */
void ceph_release_page_vector(struct page **pages, int num_pages)
{
	struct page **cur = pages;
	struct page **end = pages + num_pages;

	while (cur < end)
		__free_pages(*cur++, 0);
	kfree(pages);
}
316
317/*
318 * allocate a vector new pages
319 */
320static struct page **alloc_page_vector(int num_pages)
321{
322 struct page **pages;
323 int i;
324
325 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
326 if (!pages)
327 return ERR_PTR(-ENOMEM);
328 for (i = 0; i < num_pages; i++) {
329 pages[i] = alloc_page(GFP_NOFS);
330 if (pages[i] == NULL) {
331 ceph_release_page_vector(pages, i);
332 return ERR_PTR(-ENOMEM);
333 }
334 }
335 return pages;
336}
337
338/*
339 * copy user data into a page vector
340 */
341static int copy_user_to_page_vector(struct page **pages,
342 const char __user *data,
343 loff_t off, size_t len)
344{
345 int i = 0;
346 int po = off & ~PAGE_CACHE_MASK;
347 int left = len;
348 int l, bad;
349
350 while (left > 0) {
351 l = min_t(int, PAGE_CACHE_SIZE-po, left);
352 bad = copy_from_user(page_address(pages[i]) + po, data, l);
353 if (bad == l)
354 return -EFAULT;
355 data += l - bad;
356 left -= l - bad;
357 po += l - bad;
358 if (po == PAGE_CACHE_SIZE) {
359 po = 0;
360 i++;
361 }
362 }
363 return len;
364}
365
366/*
367 * copy user data from a page vector into a user pointer
368 */
369static int copy_page_vector_to_user(struct page **pages, char __user *data,
370 loff_t off, size_t len)
371{
372 int i = 0;
373 int po = off & ~PAGE_CACHE_MASK;
374 int left = len;
375 int l, bad;
376
377 while (left > 0) {
378 l = min_t(int, left, PAGE_CACHE_SIZE-po);
379 bad = copy_to_user(data, page_address(pages[i]) + po, l);
380 if (bad == l)
381 return -EFAULT;
382 data += l - bad;
383 left -= l - bad;
384 if (po) {
385 po += l - bad;
386 if (po == PAGE_CACHE_SIZE)
387 po = 0;
388 }
389 i++;
390 }
391 return len;
392}
393
394/*
395 * Zero an extent within a page vector. Offset is relative to the
396 * start of the first page.
397 */
398static void zero_page_vector_range(int off, int len, struct page **pages)
399{
400 int i = off >> PAGE_CACHE_SHIFT;
401
402 off &= ~PAGE_CACHE_MASK;
403
404 dout("zero_page_vector_page %u~%u\n", off, len);
405
406 /* leading partial page? */
407 if (off) {
408 int end = min((int)PAGE_CACHE_SIZE, off + len);
409 dout("zeroing %d %p head from %d\n", i, pages[i],
410 (int)off);
411 zero_user_segment(pages[i], off, end);
412 len -= (end - off);
413 i++;
414 }
415 while (len >= PAGE_CACHE_SIZE) {
416 dout("zeroing %d %p len=%d\n", i, pages[i], len);
417 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
418 len -= PAGE_CACHE_SIZE;
419 i++;
420 }
421 /* trailing partial page? */
422 if (len) {
423 dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
424 zero_user_segment(pages[i], 0, len);
425 }
426}
427
428
429/*
430 * Read a range of bytes striped over one or more objects. Iterate over
431 * objects we stripe over. (That's not atomic, but good enough for now.)
432 *
433 * If we get a short result from the OSD, check against i_size; we need to
434 * only return a short read to the caller if we hit EOF.
435 */
436static int striped_read(struct inode *inode,
437 u64 off, u64 len,
438 struct page **pages, int num_pages,
439 int *checkeof)
440{
441 struct ceph_client *client = ceph_inode_to_client(inode);
442 struct ceph_inode_info *ci = ceph_inode(inode);
443 u64 pos, this_len;
444 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
445 int left, pages_left;
446 int read;
447 struct page **page_pos;
448 int ret;
449 bool hit_stripe, was_short;
450
451 /*
452 * we may need to do multiple reads. not atomic, unfortunately.
453 */
454 pos = off;
455 left = len;
456 page_pos = pages;
457 pages_left = num_pages;
458 read = 0;
459
460more:
461 this_len = left;
462 ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
463 &ci->i_layout, pos, &this_len,
464 ci->i_truncate_seq,
465 ci->i_truncate_size,
466 page_pos, pages_left);
467 hit_stripe = this_len < left;
468 was_short = ret >= 0 && ret < this_len;
469 if (ret == -ENOENT)
470 ret = 0;
471 dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
472 ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
473
474 if (ret > 0) {
475 int didpages =
476 ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT;
477
478 if (read < pos - off) {
479 dout(" zero gap %llu to %llu\n", off + read, pos);
480 zero_page_vector_range(page_off + read,
481 pos - off - read, pages);
482 }
483 pos += ret;
484 read = pos - off;
485 left -= ret;
486 page_pos += didpages;
487 pages_left -= didpages;
488
489 /* hit stripe? */
490 if (left && hit_stripe)
491 goto more;
492 }
493
494 if (was_short) {
495 /* was original extent fully inside i_size? */
496 if (pos + left <= inode->i_size) {
497 dout("zero tail\n");
498 zero_page_vector_range(page_off + read, len - read,
499 pages);
500 read = len;
501 goto out;
502 }
503
504 /* check i_size */
505 *checkeof = 1;
506 }
507
508out:
509 if (ret >= 0)
510 ret = read;
511 dout("striped_read returns %d\n", ret);
512 return ret;
513}
514
515/*
516 * Completely synchronous read and write methods. Direct from __user
517 * buffer to osd, or directly to user pages (if O_DIRECT).
518 *
519 * If the read spans object boundary, just do multiple reads.
520 */
521static ssize_t ceph_sync_read(struct file *file, char __user *data,
522 unsigned len, loff_t *poff, int *checkeof)
523{
524 struct inode *inode = file->f_dentry->d_inode;
525 struct page **pages;
526 u64 off = *poff;
527 int num_pages = calc_pages_for(off, len);
528 int ret;
529
530 dout("sync_read on file %p %llu~%u %s\n", file, off, len,
531 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
532
533 if (file->f_flags & O_DIRECT) {
534 pages = get_direct_page_vector(data, num_pages, off, len);
535
536 /*
537 * flush any page cache pages in this range. this
538 * will make concurrent normal and O_DIRECT io slow,
539 * but it will at least behave sensibly when they are
540 * in sequence.
541 */
542 } else {
543 pages = alloc_page_vector(num_pages);
544 }
545 if (IS_ERR(pages))
546 return PTR_ERR(pages);
547
548 ret = filemap_write_and_wait(inode->i_mapping);
549 if (ret < 0)
550 goto done;
551
552 ret = striped_read(inode, off, len, pages, num_pages, checkeof);
553
554 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
555 ret = copy_page_vector_to_user(pages, data, off, ret);
556 if (ret >= 0)
557 *poff = off + ret;
558
559done:
560 if (file->f_flags & O_DIRECT)
561 put_page_vector(pages, num_pages);
562 else
563 ceph_release_page_vector(pages, num_pages);
564 dout("sync_read result %d\n", ret);
565 return ret;
566}
567
568/*
569 * Write commit callback, called if we requested both an ACK and
570 * ONDISK commit reply from the OSD.
571 */
572static void sync_write_commit(struct ceph_osd_request *req,
573 struct ceph_msg *msg)
574{
575 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
576
577 dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
578 spin_lock(&ci->i_unsafe_lock);
579 list_del_init(&req->r_unsafe_item);
580 spin_unlock(&ci->i_unsafe_lock);
581 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
582}
583
584/*
585 * Synchronous write, straight from __user pointer or user pages (if
586 * O_DIRECT).
587 *
588 * If write spans object boundary, just do multiple writes. (For a
589 * correct atomic write, we should e.g. take write locks on all
590 * objects, rollback on failure, etc.)
591 */
592static ssize_t ceph_sync_write(struct file *file, const char __user *data,
593 size_t left, loff_t *offset)
594{
595 struct inode *inode = file->f_dentry->d_inode;
596 struct ceph_inode_info *ci = ceph_inode(inode);
597 struct ceph_client *client = ceph_inode_to_client(inode);
598 struct ceph_osd_request *req;
599 struct page **pages;
600 int num_pages;
601 long long unsigned pos;
602 u64 len;
603 int written = 0;
604 int flags;
605 int do_sync = 0;
606 int check_caps = 0;
607 int ret;
608 struct timespec mtime = CURRENT_TIME;
609
610 if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
611 return -EROFS;
612
613 dout("sync_write on file %p %lld~%u %s\n", file, *offset,
614 (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
615
616 if (file->f_flags & O_APPEND)
617 pos = i_size_read(inode);
618 else
619 pos = *offset;
620
621 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
622 if (ret < 0)
623 return ret;
624
625 ret = invalidate_inode_pages2_range(inode->i_mapping,
626 pos >> PAGE_CACHE_SHIFT,
627 (pos + left) >> PAGE_CACHE_SHIFT);
628 if (ret < 0)
629 dout("invalidate_inode_pages2_range returned %d\n", ret);
630
631 flags = CEPH_OSD_FLAG_ORDERSNAP |
632 CEPH_OSD_FLAG_ONDISK |
633 CEPH_OSD_FLAG_WRITE;
634 if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
635 flags |= CEPH_OSD_FLAG_ACK;
636 else
637 do_sync = 1;
638
639 /*
640 * we may need to do multiple writes here if we span an object
641 * boundary. this isn't atomic, unfortunately. :(
642 */
643more:
644 len = left;
645 req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
646 ceph_vino(inode), pos, &len,
647 CEPH_OSD_OP_WRITE, flags,
648 ci->i_snap_realm->cached_context,
649 do_sync,
650 ci->i_truncate_seq, ci->i_truncate_size,
651 &mtime, false, 2);
652 if (IS_ERR(req))
653 return PTR_ERR(req);
654
655 num_pages = calc_pages_for(pos, len);
656
657 if (file->f_flags & O_DIRECT) {
658 pages = get_direct_page_vector(data, num_pages, pos, len);
659 if (IS_ERR(pages)) {
660 ret = PTR_ERR(pages);
661 goto out;
662 }
663
664 /*
665 * throw out any page cache pages in this range. this
666 * may block.
667 */
668 truncate_inode_pages_range(inode->i_mapping, pos, pos+len);
669 } else {
670 pages = alloc_page_vector(num_pages);
671 if (IS_ERR(pages)) {
672 ret = PTR_ERR(pages);
673 goto out;
674 }
675 ret = copy_user_to_page_vector(pages, data, pos, len);
676 if (ret < 0) {
677 ceph_release_page_vector(pages, num_pages);
678 goto out;
679 }
680
681 if ((file->f_flags & O_SYNC) == 0) {
682 /* get a second commit callback */
683 req->r_safe_callback = sync_write_commit;
684 req->r_own_pages = 1;
685 }
686 }
687 req->r_pages = pages;
688 req->r_num_pages = num_pages;
689 req->r_inode = inode;
690
691 ret = ceph_osdc_start_request(&client->osdc, req, false);
692 if (!ret) {
693 if (req->r_safe_callback) {
694 /*
695 * Add to inode unsafe list only after we
696 * start_request so that a tid has been assigned.
697 */
698 spin_lock(&ci->i_unsafe_lock);
699 list_add(&ci->i_unsafe_writes, &req->r_unsafe_item);
700 spin_unlock(&ci->i_unsafe_lock);
701 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
702 }
703 ret = ceph_osdc_wait_request(&client->osdc, req);
704 }
705
706 if (file->f_flags & O_DIRECT)
707 put_page_vector(pages, num_pages);
708 else if (file->f_flags & O_SYNC)
709 ceph_release_page_vector(pages, num_pages);
710
711out:
712 ceph_osdc_put_request(req);
713 if (ret == 0) {
714 pos += len;
715 written += len;
716 left -= len;
717 if (left)
718 goto more;
719
720 ret = written;
721 *offset = pos;
722 if (pos > i_size_read(inode))
723 check_caps = ceph_inode_set_size(inode, pos);
724 if (check_caps)
725 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
726 NULL);
727 }
728 return ret;
729}
730
731/*
732 * Wrap generic_file_aio_read with checks for cap bits on the inode.
733 * Atomically grab references, so that those bits are not released
734 * back to the MDS mid-read.
735 *
736 * Hmm, the sync read case isn't actually async... should it be?
737 */
738static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
739 unsigned long nr_segs, loff_t pos)
740{
741 struct file *filp = iocb->ki_filp;
742 loff_t *ppos = &iocb->ki_pos;
743 size_t len = iov->iov_len;
744 struct inode *inode = filp->f_dentry->d_inode;
745 struct ceph_inode_info *ci = ceph_inode(inode);
746 void *base = iov->iov_base;
747 ssize_t ret;
748 int got = 0;
749 int checkeof = 0, read = 0;
750
751 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
752 inode, ceph_vinop(inode), pos, (unsigned)len, inode);
753again:
754 __ceph_do_pending_vmtruncate(inode);
755 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE,
756 &got, -1);
757 if (ret < 0)
758 goto out;
759 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
760 inode, ceph_vinop(inode), pos, (unsigned)len,
761 ceph_cap_string(got));
762
763 if ((got & CEPH_CAP_FILE_CACHE) == 0 ||
764 (iocb->ki_filp->f_flags & O_DIRECT) ||
765 (inode->i_sb->s_flags & MS_SYNCHRONOUS))
766 /* hmm, this isn't really async... */
767 ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
768 else
769 ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
770
771out:
772 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
773 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
774 ceph_put_cap_refs(ci, got);
775
776 if (checkeof && ret >= 0) {
777 int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
778
779 /* hit EOF or hole? */
780 if (statret == 0 && *ppos < inode->i_size) {
781 dout("aio_read sync_read hit hole, reading more\n");
782 read += ret;
783 base += ret;
784 len -= ret;
785 checkeof = 0;
786 goto again;
787 }
788 }
789 if (ret >= 0)
790 ret += read;
791
792 return ret;
793}
794
795/*
796 * Take cap references to avoid releasing caps to MDS mid-write.
797 *
798 * If we are synchronous, and write with an old snap context, the OSD
799 * may return EOLDSNAPC. In that case, retry the write.. _after_
800 * dropping our cap refs and allowing the pending snap to logically
801 * complete _before_ this write occurs.
802 *
803 * If we are near ENOSPC, write synchronously.
804 */
805static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
806 unsigned long nr_segs, loff_t pos)
807{
808 struct file *file = iocb->ki_filp;
809 struct inode *inode = file->f_dentry->d_inode;
810 struct ceph_inode_info *ci = ceph_inode(inode);
811 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
812 loff_t endoff = pos + iov->iov_len;
813 int got = 0;
814 int ret, err;
815
816 if (ceph_snap(inode) != CEPH_NOSNAP)
817 return -EROFS;
818
819retry_snap:
820 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
821 return -ENOSPC;
822 __ceph_do_pending_vmtruncate(inode);
823 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
824 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
825 inode->i_size);
826 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
827 &got, endoff);
828 if (ret < 0)
829 goto out;
830
831 dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
832 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
833 ceph_cap_string(got));
834
835 if ((got & CEPH_CAP_FILE_BUFFER) == 0 ||
836 (iocb->ki_filp->f_flags & O_DIRECT) ||
837 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
838 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
839 &iocb->ki_pos);
840 } else {
841 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
842
843 if ((ret >= 0 || ret == -EIOCBQUEUED) &&
844 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
845 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
846 err = vfs_fsync_range(file, file->f_path.dentry,
847 pos, pos + ret - 1, 1);
848 if (err < 0)
849 ret = err;
850 }
851 }
852 if (ret >= 0) {
853 spin_lock(&inode->i_lock);
854 __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
855 spin_unlock(&inode->i_lock);
856 }
857
858out:
859 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
860 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
861 ceph_cap_string(got));
862 ceph_put_cap_refs(ci, got);
863
864 if (ret == -EOLDSNAPC) {
865 dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
866 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
867 goto retry_snap;
868 }
869
870 return ret;
871}
872
873/*
874 * llseek. be sure to verify file size on SEEK_END.
875 */
876static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
877{
878 struct inode *inode = file->f_mapping->host;
879 int ret;
880
881 mutex_lock(&inode->i_mutex);
882 __ceph_do_pending_vmtruncate(inode);
883 switch (origin) {
884 case SEEK_END:
885 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
886 if (ret < 0) {
887 offset = ret;
888 goto out;
889 }
890 offset += inode->i_size;
891 break;
892 case SEEK_CUR:
893 /*
894 * Here we special-case the lseek(fd, 0, SEEK_CUR)
895 * position-querying operation. Avoid rewriting the "same"
896 * f_pos value back to the file because a concurrent read(),
897 * write() or lseek() might have altered it
898 */
899 if (offset == 0) {
900 offset = file->f_pos;
901 goto out;
902 }
903 offset += file->f_pos;
904 break;
905 }
906
907 if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
908 offset = -EINVAL;
909 goto out;
910 }
911
912 /* Special lock needed here? */
913 if (offset != file->f_pos) {
914 file->f_pos = offset;
915 file->f_version = 0;
916 }
917
918out:
919 mutex_unlock(&inode->i_mutex);
920 return offset;
921}
922
923const struct file_operations ceph_file_fops = {
924 .open = ceph_open,
925 .release = ceph_release,
926 .llseek = ceph_llseek,
927 .read = do_sync_read,
928 .write = do_sync_write,
929 .aio_read = ceph_aio_read,
930 .aio_write = ceph_aio_write,
931 .mmap = ceph_mmap,
932 .fsync = ceph_fsync,
933 .splice_read = generic_file_splice_read,
934 .splice_write = generic_file_splice_write,
935 .unlocked_ioctl = ceph_ioctl,
936 .compat_ioctl = ceph_ioctl,
937};
938
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
new file mode 100644
index 000000000000..26f883c275e8
--- /dev/null
+++ b/fs/ceph/inode.c
@@ -0,0 +1,1774 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/fs.h>
5#include <linux/smp_lock.h>
6#include <linux/slab.h>
7#include <linux/string.h>
8#include <linux/uaccess.h>
9#include <linux/kernel.h>
10#include <linux/namei.h>
11#include <linux/writeback.h>
12#include <linux/vmalloc.h>
13#include <linux/pagevec.h>
14
15#include "super.h"
16#include "decode.h"
17
18/*
19 * Ceph inode operations
20 *
21 * Implement basic inode helpers (get, alloc) and inode ops (getattr,
22 * setattr, etc.), xattr helpers, and helpers for assimilating
23 * metadata returned by the MDS into our cache.
24 *
25 * Also define helpers for doing asynchronous writeback, invalidation,
26 * and truncation for the benefit of those who can't afford to block
27 * (typically because they are in the message handler path).
28 */
29
30static const struct inode_operations ceph_symlink_iops;
31
32static void ceph_invalidate_work(struct work_struct *work);
33static void ceph_writeback_work(struct work_struct *work);
34static void ceph_vmtruncate_work(struct work_struct *work);
35
36/*
37 * find or create an inode, given the ceph ino number
38 */
39struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
40{
41 struct inode *inode;
42 ino_t t = ceph_vino_to_ino(vino);
43
44 inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
45 if (inode == NULL)
46 return ERR_PTR(-ENOMEM);
47 if (inode->i_state & I_NEW) {
48 dout("get_inode created new inode %p %llx.%llx ino %llx\n",
49 inode, ceph_vinop(inode), (u64)inode->i_ino);
50 unlock_new_inode(inode);
51 }
52
53 dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
54 vino.snap, inode);
55 return inode;
56}
57
58/*
59 * get/constuct snapdir inode for a given directory
60 */
61struct inode *ceph_get_snapdir(struct inode *parent)
62{
63 struct ceph_vino vino = {
64 .ino = ceph_ino(parent),
65 .snap = CEPH_SNAPDIR,
66 };
67 struct inode *inode = ceph_get_inode(parent->i_sb, vino);
68 struct ceph_inode_info *ci = ceph_inode(inode);
69
70 BUG_ON(!S_ISDIR(parent->i_mode));
71 if (IS_ERR(inode))
72 return ERR_PTR(PTR_ERR(inode));
73 inode->i_mode = parent->i_mode;
74 inode->i_uid = parent->i_uid;
75 inode->i_gid = parent->i_gid;
76 inode->i_op = &ceph_dir_iops;
77 inode->i_fop = &ceph_dir_fops;
78 ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
79 ci->i_rbytes = 0;
80 return inode;
81}
82
83const struct inode_operations ceph_file_iops = {
84 .permission = ceph_permission,
85 .setattr = ceph_setattr,
86 .getattr = ceph_getattr,
87 .setxattr = ceph_setxattr,
88 .getxattr = ceph_getxattr,
89 .listxattr = ceph_listxattr,
90 .removexattr = ceph_removexattr,
91};
92
93
94/*
95 * We use a 'frag tree' to keep track of the MDS's directory fragments
96 * for a given inode (usually there is just a single fragment). We
97 * need to know when a child frag is delegated to a new MDS, or when
98 * it is flagged as replicated, so we can direct our requests
99 * accordingly.
100 */
101
102/*
103 * find/create a frag in the tree
104 */
105static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
106 u32 f)
107{
108 struct rb_node **p;
109 struct rb_node *parent = NULL;
110 struct ceph_inode_frag *frag;
111 int c;
112
113 p = &ci->i_fragtree.rb_node;
114 while (*p) {
115 parent = *p;
116 frag = rb_entry(parent, struct ceph_inode_frag, node);
117 c = ceph_frag_compare(f, frag->frag);
118 if (c < 0)
119 p = &(*p)->rb_left;
120 else if (c > 0)
121 p = &(*p)->rb_right;
122 else
123 return frag;
124 }
125
126 frag = kmalloc(sizeof(*frag), GFP_NOFS);
127 if (!frag) {
128 pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx "
129 "frag %x\n", &ci->vfs_inode,
130 ceph_vinop(&ci->vfs_inode), f);
131 return ERR_PTR(-ENOMEM);
132 }
133 frag->frag = f;
134 frag->split_by = 0;
135 frag->mds = -1;
136 frag->ndist = 0;
137
138 rb_link_node(&frag->node, parent, p);
139 rb_insert_color(&frag->node, &ci->i_fragtree);
140
141 dout("get_or_create_frag added %llx.%llx frag %x\n",
142 ceph_vinop(&ci->vfs_inode), f);
143 return frag;
144}
145
146/*
147 * find a specific frag @f
148 */
149struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
150{
151 struct rb_node *n = ci->i_fragtree.rb_node;
152
153 while (n) {
154 struct ceph_inode_frag *frag =
155 rb_entry(n, struct ceph_inode_frag, node);
156 int c = ceph_frag_compare(f, frag->frag);
157 if (c < 0)
158 n = n->rb_left;
159 else if (c > 0)
160 n = n->rb_right;
161 else
162 return frag;
163 }
164 return NULL;
165}
166
167/*
168 * Choose frag containing the given value @v. If @pfrag is
169 * specified, copy the frag delegation info to the caller if
170 * it is present.
171 */
172u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
173 struct ceph_inode_frag *pfrag,
174 int *found)
175{
176 u32 t = ceph_frag_make(0, 0);
177 struct ceph_inode_frag *frag;
178 unsigned nway, i;
179 u32 n;
180
181 if (found)
182 *found = 0;
183
184 mutex_lock(&ci->i_fragtree_mutex);
185 while (1) {
186 WARN_ON(!ceph_frag_contains_value(t, v));
187 frag = __ceph_find_frag(ci, t);
188 if (!frag)
189 break; /* t is a leaf */
190 if (frag->split_by == 0) {
191 if (pfrag)
192 memcpy(pfrag, frag, sizeof(*pfrag));
193 if (found)
194 *found = 1;
195 break;
196 }
197
198 /* choose child */
199 nway = 1 << frag->split_by;
200 dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
201 frag->split_by, nway);
202 for (i = 0; i < nway; i++) {
203 n = ceph_frag_make_child(t, frag->split_by, i);
204 if (ceph_frag_contains_value(n, v)) {
205 t = n;
206 break;
207 }
208 }
209 BUG_ON(i == nway);
210 }
211 dout("choose_frag(%x) = %x\n", v, t);
212
213 mutex_unlock(&ci->i_fragtree_mutex);
214 return t;
215}
216
217/*
218 * Process dirfrag (delegation) info from the mds. Include leaf
219 * fragment in tree ONLY if ndist > 0. Otherwise, only
220 * branches/splits are included in i_fragtree)
221 */
222static int ceph_fill_dirfrag(struct inode *inode,
223 struct ceph_mds_reply_dirfrag *dirinfo)
224{
225 struct ceph_inode_info *ci = ceph_inode(inode);
226 struct ceph_inode_frag *frag;
227 u32 id = le32_to_cpu(dirinfo->frag);
228 int mds = le32_to_cpu(dirinfo->auth);
229 int ndist = le32_to_cpu(dirinfo->ndist);
230 int i;
231 int err = 0;
232
233 mutex_lock(&ci->i_fragtree_mutex);
234 if (ndist == 0) {
235 /* no delegation info needed. */
236 frag = __ceph_find_frag(ci, id);
237 if (!frag)
238 goto out;
239 if (frag->split_by == 0) {
240 /* tree leaf, remove */
241 dout("fill_dirfrag removed %llx.%llx frag %x"
242 " (no ref)\n", ceph_vinop(inode), id);
243 rb_erase(&frag->node, &ci->i_fragtree);
244 kfree(frag);
245 } else {
246 /* tree branch, keep and clear */
247 dout("fill_dirfrag cleared %llx.%llx frag %x"
248 " referral\n", ceph_vinop(inode), id);
249 frag->mds = -1;
250 frag->ndist = 0;
251 }
252 goto out;
253 }
254
255
256 /* find/add this frag to store mds delegation info */
257 frag = __get_or_create_frag(ci, id);
258 if (IS_ERR(frag)) {
259 /* this is not the end of the world; we can continue
260 with bad/inaccurate delegation info */
261 pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
262 ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
263 err = -ENOMEM;
264 goto out;
265 }
266
267 frag->mds = mds;
268 frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
269 for (i = 0; i < frag->ndist; i++)
270 frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
271 dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
272 ceph_vinop(inode), frag->frag, frag->ndist);
273
274out:
275 mutex_unlock(&ci->i_fragtree_mutex);
276 return err;
277}
278
279
280/*
281 * initialize a newly allocated inode.
282 */
283struct inode *ceph_alloc_inode(struct super_block *sb)
284{
285 struct ceph_inode_info *ci;
286 int i;
287
288 ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
289 if (!ci)
290 return NULL;
291
292 dout("alloc_inode %p\n", &ci->vfs_inode);
293
294 ci->i_version = 0;
295 ci->i_time_warp_seq = 0;
296 ci->i_ceph_flags = 0;
297 ci->i_release_count = 0;
298 ci->i_symlink = NULL;
299
300 ci->i_fragtree = RB_ROOT;
301 mutex_init(&ci->i_fragtree_mutex);
302
303 ci->i_xattrs.blob = NULL;
304 ci->i_xattrs.prealloc_blob = NULL;
305 ci->i_xattrs.dirty = false;
306 ci->i_xattrs.index = RB_ROOT;
307 ci->i_xattrs.count = 0;
308 ci->i_xattrs.names_size = 0;
309 ci->i_xattrs.vals_size = 0;
310 ci->i_xattrs.version = 0;
311 ci->i_xattrs.index_version = 0;
312
313 ci->i_caps = RB_ROOT;
314 ci->i_auth_cap = NULL;
315 ci->i_dirty_caps = 0;
316 ci->i_flushing_caps = 0;
317 INIT_LIST_HEAD(&ci->i_dirty_item);
318 INIT_LIST_HEAD(&ci->i_flushing_item);
319 ci->i_cap_flush_seq = 0;
320 ci->i_cap_flush_last_tid = 0;
321 memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
322 init_waitqueue_head(&ci->i_cap_wq);
323 ci->i_hold_caps_min = 0;
324 ci->i_hold_caps_max = 0;
325 INIT_LIST_HEAD(&ci->i_cap_delay_list);
326 ci->i_cap_exporting_mds = 0;
327 ci->i_cap_exporting_mseq = 0;
328 ci->i_cap_exporting_issued = 0;
329 INIT_LIST_HEAD(&ci->i_cap_snaps);
330 ci->i_head_snapc = NULL;
331 ci->i_snap_caps = 0;
332
333 for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
334 ci->i_nr_by_mode[i] = 0;
335
336 ci->i_truncate_seq = 0;
337 ci->i_truncate_size = 0;
338 ci->i_truncate_pending = 0;
339
340 ci->i_max_size = 0;
341 ci->i_reported_size = 0;
342 ci->i_wanted_max_size = 0;
343 ci->i_requested_max_size = 0;
344
345 ci->i_pin_ref = 0;
346 ci->i_rd_ref = 0;
347 ci->i_rdcache_ref = 0;
348 ci->i_wr_ref = 0;
349 ci->i_wrbuffer_ref = 0;
350 ci->i_wrbuffer_ref_head = 0;
351 ci->i_shared_gen = 0;
352 ci->i_rdcache_gen = 0;
353 ci->i_rdcache_revoking = 0;
354
355 INIT_LIST_HEAD(&ci->i_unsafe_writes);
356 INIT_LIST_HEAD(&ci->i_unsafe_dirops);
357 spin_lock_init(&ci->i_unsafe_lock);
358
359 ci->i_snap_realm = NULL;
360 INIT_LIST_HEAD(&ci->i_snap_realm_item);
361 INIT_LIST_HEAD(&ci->i_snap_flush_item);
362
363 INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
364 INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);
365
366 INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
367
368 return &ci->vfs_inode;
369}
370
371void ceph_destroy_inode(struct inode *inode)
372{
373 struct ceph_inode_info *ci = ceph_inode(inode);
374 struct ceph_inode_frag *frag;
375 struct rb_node *n;
376
377 dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
378
379 ceph_queue_caps_release(inode);
380
381 /*
382 * we may still have a snap_realm reference if there are stray
383 * caps in i_cap_exporting_issued or i_snap_caps.
384 */
385 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc =
387 &ceph_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm;
389
390 dout(" dropping residual ref to snap realm %p\n", realm);
391 spin_lock(&realm->inodes_with_caps_lock);
392 list_del_init(&ci->i_snap_realm_item);
393 spin_unlock(&realm->inodes_with_caps_lock);
394 ceph_put_snap_realm(mdsc, realm);
395 }
396
397 kfree(ci->i_symlink);
398 while ((n = rb_first(&ci->i_fragtree)) != NULL) {
399 frag = rb_entry(n, struct ceph_inode_frag, node);
400 rb_erase(n, &ci->i_fragtree);
401 kfree(frag);
402 }
403
404 __ceph_destroy_xattrs(ci);
405 if (ci->i_xattrs.blob)
406 ceph_buffer_put(ci->i_xattrs.blob);
407 if (ci->i_xattrs.prealloc_blob)
408 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
409
410 kmem_cache_free(ceph_inode_cachep, ci);
411}
412
413
414/*
415 * Helpers to fill in size, ctime, mtime, and atime. We have to be
416 * careful because either the client or MDS may have more up to date
417 * info, depending on which capabilities are held, and whether
418 * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
419 * and size are monotonically increasing, except when utimes() or
420 * truncate() increments the corresponding _seq values.)
421 */
422int ceph_fill_file_size(struct inode *inode, int issued,
423 u32 truncate_seq, u64 truncate_size, u64 size)
424{
425 struct ceph_inode_info *ci = ceph_inode(inode);
426 int queue_trunc = 0;
427
428 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
429 (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
430 dout("size %lld -> %llu\n", inode->i_size, size);
431 inode->i_size = size;
432 inode->i_blocks = (size + (1<<9) - 1) >> 9;
433 ci->i_reported_size = size;
434 if (truncate_seq != ci->i_truncate_seq) {
435 dout("truncate_seq %u -> %u\n",
436 ci->i_truncate_seq, truncate_seq);
437 ci->i_truncate_seq = truncate_seq;
438 /*
439 * If we hold relevant caps, or in the case where we're
440 * not the only client referencing this file and we
441 * don't hold those caps, then we need to check whether
442 * the file is either opened or mmaped
443 */
444 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
445 CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
446 CEPH_CAP_FILE_EXCL)) ||
447 mapping_mapped(inode->i_mapping) ||
448 __ceph_caps_file_wanted(ci)) {
449 ci->i_truncate_pending++;
450 queue_trunc = 1;
451 }
452 }
453 }
454 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
455 ci->i_truncate_size != truncate_size) {
456 dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
457 truncate_size);
458 ci->i_truncate_size = truncate_size;
459 }
460 return queue_trunc;
461}
462
463void ceph_fill_file_time(struct inode *inode, int issued,
464 u64 time_warp_seq, struct timespec *ctime,
465 struct timespec *mtime, struct timespec *atime)
466{
467 struct ceph_inode_info *ci = ceph_inode(inode);
468 int warn = 0;
469
470 if (issued & (CEPH_CAP_FILE_EXCL|
471 CEPH_CAP_FILE_WR|
472 CEPH_CAP_FILE_BUFFER)) {
473 if (timespec_compare(ctime, &inode->i_ctime) > 0) {
474 dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
475 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
476 ctime->tv_sec, ctime->tv_nsec);
477 inode->i_ctime = *ctime;
478 }
479 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
480 /* the MDS did a utimes() */
481 dout("mtime %ld.%09ld -> %ld.%09ld "
482 "tw %d -> %d\n",
483 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
484 mtime->tv_sec, mtime->tv_nsec,
485 ci->i_time_warp_seq, (int)time_warp_seq);
486
487 inode->i_mtime = *mtime;
488 inode->i_atime = *atime;
489 ci->i_time_warp_seq = time_warp_seq;
490 } else if (time_warp_seq == ci->i_time_warp_seq) {
491 /* nobody did utimes(); take the max */
492 if (timespec_compare(mtime, &inode->i_mtime) > 0) {
493 dout("mtime %ld.%09ld -> %ld.%09ld inc\n",
494 inode->i_mtime.tv_sec,
495 inode->i_mtime.tv_nsec,
496 mtime->tv_sec, mtime->tv_nsec);
497 inode->i_mtime = *mtime;
498 }
499 if (timespec_compare(atime, &inode->i_atime) > 0) {
500 dout("atime %ld.%09ld -> %ld.%09ld inc\n",
501 inode->i_atime.tv_sec,
502 inode->i_atime.tv_nsec,
503 atime->tv_sec, atime->tv_nsec);
504 inode->i_atime = *atime;
505 }
506 } else if (issued & CEPH_CAP_FILE_EXCL) {
507 /* we did a utimes(); ignore mds values */
508 } else {
509 warn = 1;
510 }
511 } else {
512 /* we have no write caps; whatever the MDS says is true */
513 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
514 inode->i_ctime = *ctime;
515 inode->i_mtime = *mtime;
516 inode->i_atime = *atime;
517 ci->i_time_warp_seq = time_warp_seq;
518 } else {
519 warn = 1;
520 }
521 }
522 if (warn) /* time_warp_seq shouldn't go backwards */
523 dout("%p mds time_warp_seq %llu < %u\n",
524 inode, time_warp_seq, ci->i_time_warp_seq);
525}
526
527/*
528 * Populate an inode based on info from mds. May be called on new or
529 * existing inodes.
530 */
531static int fill_inode(struct inode *inode,
532 struct ceph_mds_reply_info_in *iinfo,
533 struct ceph_mds_reply_dirfrag *dirinfo,
534 struct ceph_mds_session *session,
535 unsigned long ttl_from, int cap_fmode,
536 struct ceph_cap_reservation *caps_reservation)
537{
538 struct ceph_mds_reply_inode *info = iinfo->in;
539 struct ceph_inode_info *ci = ceph_inode(inode);
540 int i;
541 int issued, implemented;
542 struct timespec mtime, atime, ctime;
543 u32 nsplits;
544 struct ceph_buffer *xattr_blob = NULL;
545 int err = 0;
546 int queue_trunc = 0;
547
548 dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
549 inode, ceph_vinop(inode), le64_to_cpu(info->version),
550 ci->i_version);
551
552 /*
553 * prealloc xattr data, if it looks like we'll need it. only
554 * if len > 4 (meaning there are actually xattrs; the first 4
555 * bytes are the xattr count).
556 */
557 if (iinfo->xattr_len > 4) {
558 xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
559 if (!xattr_blob)
560 pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
561 iinfo->xattr_len);
562 }
563
564 spin_lock(&inode->i_lock);
565
566 /*
567 * provided version will be odd if inode value is projected,
568 * even if stable. skip the update if we have a newer info
569 * (e.g., due to inode info racing form multiple MDSs), or if
570 * we are getting projected (unstable) inode info.
571 */
572 if (le64_to_cpu(info->version) > 0 &&
573 (ci->i_version & ~1) > le64_to_cpu(info->version))
574 goto no_change;
575
576 issued = __ceph_caps_issued(ci, &implemented);
577 issued |= implemented | __ceph_caps_dirty(ci);
578
579 /* update inode */
580 ci->i_version = le64_to_cpu(info->version);
581 inode->i_version++;
582 inode->i_rdev = le32_to_cpu(info->rdev);
583
584 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
585 inode->i_mode = le32_to_cpu(info->mode);
586 inode->i_uid = le32_to_cpu(info->uid);
587 inode->i_gid = le32_to_cpu(info->gid);
588 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
589 inode->i_uid, inode->i_gid);
590 }
591
592 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
593 inode->i_nlink = le32_to_cpu(info->nlink);
594
595 /* be careful with mtime, atime, size */
596 ceph_decode_timespec(&atime, &info->atime);
597 ceph_decode_timespec(&mtime, &info->mtime);
598 ceph_decode_timespec(&ctime, &info->ctime);
599 queue_trunc = ceph_fill_file_size(inode, issued,
600 le32_to_cpu(info->truncate_seq),
601 le64_to_cpu(info->truncate_size),
602 le64_to_cpu(info->size));
603 ceph_fill_file_time(inode, issued,
604 le32_to_cpu(info->time_warp_seq),
605 &ctime, &mtime, &atime);
606
607 ci->i_max_size = le64_to_cpu(info->max_size);
608 ci->i_layout = info->layout;
609 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
610
611 /* xattrs */
612 /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
613 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
614 le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
615 if (ci->i_xattrs.blob)
616 ceph_buffer_put(ci->i_xattrs.blob);
617 ci->i_xattrs.blob = xattr_blob;
618 if (xattr_blob)
619 memcpy(ci->i_xattrs.blob->vec.iov_base,
620 iinfo->xattr_data, iinfo->xattr_len);
621 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
622 }
623
624 inode->i_mapping->a_ops = &ceph_aops;
625 inode->i_mapping->backing_dev_info =
626 &ceph_client(inode->i_sb)->backing_dev_info;
627
628 switch (inode->i_mode & S_IFMT) {
629 case S_IFIFO:
630 case S_IFBLK:
631 case S_IFCHR:
632 case S_IFSOCK:
633 init_special_inode(inode, inode->i_mode, inode->i_rdev);
634 inode->i_op = &ceph_file_iops;
635 break;
636 case S_IFREG:
637 inode->i_op = &ceph_file_iops;
638 inode->i_fop = &ceph_file_fops;
639 break;
640 case S_IFLNK:
641 inode->i_op = &ceph_symlink_iops;
642 if (!ci->i_symlink) {
643 int symlen = iinfo->symlink_len;
644 char *sym;
645
646 BUG_ON(symlen != inode->i_size);
647 spin_unlock(&inode->i_lock);
648
649 err = -ENOMEM;
650 sym = kmalloc(symlen+1, GFP_NOFS);
651 if (!sym)
652 goto out;
653 memcpy(sym, iinfo->symlink, symlen);
654 sym[symlen] = 0;
655
656 spin_lock(&inode->i_lock);
657 if (!ci->i_symlink)
658 ci->i_symlink = sym;
659 else
660 kfree(sym); /* lost a race */
661 }
662 break;
663 case S_IFDIR:
664 inode->i_op = &ceph_dir_iops;
665 inode->i_fop = &ceph_dir_fops;
666
667 ci->i_files = le64_to_cpu(info->files);
668 ci->i_subdirs = le64_to_cpu(info->subdirs);
669 ci->i_rbytes = le64_to_cpu(info->rbytes);
670 ci->i_rfiles = le64_to_cpu(info->rfiles);
671 ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
672 ceph_decode_timespec(&ci->i_rctime, &info->rctime);
673
674 /* set dir completion flag? */
675 if (ci->i_files == 0 && ci->i_subdirs == 0 &&
676 ceph_snap(inode) == CEPH_NOSNAP &&
677 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) {
678 dout(" marking %p complete (empty)\n", inode);
679 ci->i_ceph_flags |= CEPH_I_COMPLETE;
680 ci->i_max_offset = 2;
681 }
682
683 /* it may be better to set st_size in getattr instead? */
684 if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES))
685 inode->i_size = ci->i_rbytes;
686 break;
687 default:
688 pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
689 ceph_vinop(inode), inode->i_mode);
690 }
691
692no_change:
693 spin_unlock(&inode->i_lock);
694
695 /* queue truncate if we saw i_size decrease */
696 if (queue_trunc)
697 ceph_queue_vmtruncate(inode);
698
699 /* populate frag tree */
700 /* FIXME: move me up, if/when version reflects fragtree changes */
701 nsplits = le32_to_cpu(info->fragtree.nsplits);
702 mutex_lock(&ci->i_fragtree_mutex);
703 for (i = 0; i < nsplits; i++) {
704 u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
705 struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
706
707 if (IS_ERR(frag))
708 continue;
709 frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
710 dout(" frag %x split by %d\n", frag->frag, frag->split_by);
711 }
712 mutex_unlock(&ci->i_fragtree_mutex);
713
714 /* were we issued a capability? */
715 if (info->cap.caps) {
716 if (ceph_snap(inode) == CEPH_NOSNAP) {
717 ceph_add_cap(inode, session,
718 le64_to_cpu(info->cap.cap_id),
719 cap_fmode,
720 le32_to_cpu(info->cap.caps),
721 le32_to_cpu(info->cap.wanted),
722 le32_to_cpu(info->cap.seq),
723 le32_to_cpu(info->cap.mseq),
724 le64_to_cpu(info->cap.realm),
725 info->cap.flags,
726 caps_reservation);
727 } else {
728 spin_lock(&inode->i_lock);
729 dout(" %p got snap_caps %s\n", inode,
730 ceph_cap_string(le32_to_cpu(info->cap.caps)));
731 ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
732 if (cap_fmode >= 0)
733 __ceph_get_fmode(ci, cap_fmode);
734 spin_unlock(&inode->i_lock);
735 }
736 }
737
738 /* update delegation info? */
739 if (dirinfo)
740 ceph_fill_dirfrag(inode, dirinfo);
741
742 err = 0;
743
744out:
745 if (xattr_blob)
746 ceph_buffer_put(xattr_blob);
747 return err;
748}
749
750/*
751 * caller should hold session s_mutex.
752 */
753static void update_dentry_lease(struct dentry *dentry,
754 struct ceph_mds_reply_lease *lease,
755 struct ceph_mds_session *session,
756 unsigned long from_time)
757{
758 struct ceph_dentry_info *di = ceph_dentry(dentry);
759 long unsigned duration = le32_to_cpu(lease->duration_ms);
760 long unsigned ttl = from_time + (duration * HZ) / 1000;
761 long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
762 struct inode *dir;
763
764 /* only track leases on regular dentries */
765 if (dentry->d_op != &ceph_dentry_ops)
766 return;
767
768 spin_lock(&dentry->d_lock);
769 dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n",
770 dentry, le16_to_cpu(lease->mask), duration, ttl);
771
772 /* make lease_rdcache_gen match directory */
773 dir = dentry->d_parent->d_inode;
774 di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
775
776 if (lease->mask == 0)
777 goto out_unlock;
778
779 if (di->lease_gen == session->s_cap_gen &&
780 time_before(ttl, dentry->d_time))
781 goto out_unlock; /* we already have a newer lease. */
782
783 if (di->lease_session && di->lease_session != session)
784 goto out_unlock;
785
786 ceph_dentry_lru_touch(dentry);
787
788 if (!di->lease_session)
789 di->lease_session = ceph_get_mds_session(session);
790 di->lease_gen = session->s_cap_gen;
791 di->lease_seq = le32_to_cpu(lease->seq);
792 di->lease_renew_after = half_ttl;
793 di->lease_renew_from = 0;
794 dentry->d_time = ttl;
795out_unlock:
796 spin_unlock(&dentry->d_lock);
797 return;
798}
799
800/*
801 * splice a dentry to an inode.
802 * caller must hold directory i_mutex for this to be safe.
803 *
804 * we will only rehash the resulting dentry if @prehash is
805 * true; @prehash will be set to false (for the benefit of
806 * the caller) if we fail.
807 */
808static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
809 bool *prehash)
810{
811 struct dentry *realdn;
812
813 /* dn must be unhashed */
814 if (!d_unhashed(dn))
815 d_drop(dn);
816 realdn = d_materialise_unique(dn, in);
817 if (IS_ERR(realdn)) {
818 pr_err("splice_dentry error %p inode %p ino %llx.%llx\n",
819 dn, in, ceph_vinop(in));
820 if (prehash)
821 *prehash = false; /* don't rehash on error */
822 dn = realdn; /* note realdn contains the error */
823 goto out;
824 } else if (realdn) {
825 dout("dn %p (%d) spliced with %p (%d) "
826 "inode %p ino %llx.%llx\n",
827 dn, atomic_read(&dn->d_count),
828 realdn, atomic_read(&realdn->d_count),
829 realdn->d_inode, ceph_vinop(realdn->d_inode));
830 dput(dn);
831 dn = realdn;
832 } else {
833 BUG_ON(!ceph_dentry(dn));
834
835 dout("dn %p attached to %p ino %llx.%llx\n",
836 dn, dn->d_inode, ceph_vinop(dn->d_inode));
837 }
838 if ((!prehash || *prehash) && d_unhashed(dn))
839 d_rehash(dn);
840out:
841 return dn;
842}
843
844/*
845 * Set dentry's directory position based on the current dir's max, and
846 * order it in d_subdirs, so that dcache_readdir behaves.
847 */
848static void ceph_set_dentry_offset(struct dentry *dn)
849{
850 struct dentry *dir = dn->d_parent;
851 struct inode *inode = dn->d_parent->d_inode;
852 struct ceph_dentry_info *di;
853
854 BUG_ON(!inode);
855
856 di = ceph_dentry(dn);
857
858 spin_lock(&inode->i_lock);
859 di->offset = ceph_inode(inode)->i_max_offset++;
860 spin_unlock(&inode->i_lock);
861
862 spin_lock(&dcache_lock);
863 spin_lock(&dn->d_lock);
864 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
865 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
866 dn->d_u.d_child.prev, dn->d_u.d_child.next);
867 spin_unlock(&dn->d_lock);
868 spin_unlock(&dcache_lock);
869}
870
/*
 * Incorporate results into the local cache.  This is either just
 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
 * after a lookup).
 *
 * A reply may contain
 *         a directory inode along with a dentry.
 *  and/or a target inode
 *
 * Called with snap_rwsem (read).
 *
 * @sb:      super block used to look up / instantiate inodes
 * @req:     the request whose parsed reply (req->r_reply_info) is applied;
 *           req->r_dentry and req->r_target_inode may be updated here
 * @session: MDS session, passed through to fill_inode() and
 *           update_dentry_lease()
 *
 * Returns 0 on success (an empty trace also returns 0), or the negative
 * error produced by fill_inode(), ceph_get_inode() or splice_dentry().
 */
int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
		    struct ceph_mds_session *session)
{
	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
	struct inode *in = NULL;
	struct ceph_mds_reply_inode *ininfo;
	struct ceph_vino vino;
	struct ceph_client *client = ceph_sb_to_client(sb);
	int i = 0;	/* only ever incremented below; never read */
	int err = 0;

	dout("fill_trace %p is_dentry %d is_target %d\n", req,
	     rinfo->head->is_dentry, rinfo->head->is_target);

#if 0
	/*
	 * Debugging hook:
	 *
	 * If we resend completed ops to a recovering mds, we get no
	 * trace.  Since that is very rare, pretend this is the case
	 * to ensure the 'no trace' handlers in the callers behave.
	 *
	 * Fill in inodes unconditionally to avoid breaking cap
	 * invariants.
	 *
	 * NOTE(review): the fill_inode() calls in this disabled block
	 * pass one argument fewer than the live calls further down
	 * (no caps reservation), so it would need updating before
	 * being re-enabled.
	 */
	if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
		pr_info("fill_trace faking empty trace on %lld %s\n",
			req->r_tid, ceph_mds_op_name(rinfo->head->op));
		if (rinfo->head->is_dentry) {
			rinfo->head->is_dentry = 0;
			err = fill_inode(req->r_locked_dir,
					 &rinfo->diri, rinfo->dirfrag,
					 session, req->r_request_started, -1);
		}
		if (rinfo->head->is_target) {
			rinfo->head->is_target = 0;
			ininfo = rinfo->targeti.in;
			vino.ino = le64_to_cpu(ininfo->ino);
			vino.snap = le64_to_cpu(ininfo->snapid);
			in = ceph_get_inode(sb, vino);
			err = fill_inode(in, &rinfo->targeti, NULL,
					 session, req->r_request_started,
					 req->r_fmode);
			iput(in);
		}
	}
#endif

	/*
	 * Empty trace: nothing to incorporate.  If the operation
	 * succeeded against a locked directory, our cached view of that
	 * directory can no longer be trusted as complete.
	 */
	if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
		dout("fill_trace reply is empty!\n");
		if (rinfo->head->result == 0 && req->r_locked_dir) {
			struct ceph_inode_info *ci =
				ceph_inode(req->r_locked_dir);
			dout(" clearing %p complete (empty trace)\n",
			     req->r_locked_dir);
			ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
			ci->i_release_count++;
		}
		return 0;
	}

	/* update the directory inode first */
	if (rinfo->head->is_dentry) {
		struct inode *dir = req->r_locked_dir;

		err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
				 session, req->r_request_started, -1,
				 &req->r_caps_reservation);
		if (err < 0)
			return err;
	}

	/*
	 * ignore null lease/binding on snapdir ENOENT, or else we
	 * will have trouble splicing in the virtual snapdir later
	 */
	if (rinfo->head->is_dentry && !req->r_aborted &&
	    (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
					       client->mount_args->snapdir_name,
					       req->r_dentry->d_name.len))) {
		/*
		 * lookup link rename   : null -> possibly existing inode
		 * mknod symlink mkdir  : null -> new inode
		 * unlink               : linked -> null
		 */
		struct inode *dir = req->r_locked_dir;
		struct dentry *dn = req->r_dentry;
		bool have_dir_cap, have_lease;

		/* the reply must describe the directory we locked */
		BUG_ON(!dn);
		BUG_ON(!dir);
		BUG_ON(dn->d_parent->d_inode != dir);
		BUG_ON(ceph_ino(dir) !=
		       le64_to_cpu(rinfo->diri.in->ino));
		BUG_ON(ceph_snap(dir) !=
		       le64_to_cpu(rinfo->diri.in->snapid));

		/* do we have a lease on the whole dir? */
		have_dir_cap =
			(le32_to_cpu(rinfo->diri.in->cap.caps) &
			 CEPH_CAP_FILE_SHARED);

		/* do we have a dn lease? */
		have_lease = have_dir_cap ||
			(le16_to_cpu(rinfo->dlease->mask) &
			 CEPH_LOCK_DN);

		if (!have_lease)
			dout("fill_trace no dentry lease or dir cap\n");

		/* rename? */
		if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
			dout(" src %p '%.*s' dst %p '%.*s'\n",
			     req->r_old_dentry,
			     req->r_old_dentry->d_name.len,
			     req->r_old_dentry->d_name.name,
			     dn, dn->d_name.len, dn->d_name.name);
			dout("fill_trace doing d_move %p -> %p\n",
			     req->r_old_dentry, dn);
			d_move(req->r_old_dentry, dn);
			dout(" src %p '%.*s' dst %p '%.*s'\n",
			     req->r_old_dentry,
			     req->r_old_dentry->d_name.len,
			     req->r_old_dentry->d_name.name,
			     dn, dn->d_name.len, dn->d_name.name);
			/* ensure target dentry is invalidated, despite
			   rehashing bug in vfs_rename_dir */
			dn->d_time = jiffies;
			ceph_dentry(dn)->lease_shared_gen = 0;
			/* take overwritten dentry's readdir offset */
			ceph_dentry(req->r_old_dentry)->offset =
				ceph_dentry(dn)->offset;
			dn = req->r_old_dentry;  /* use old_dentry */
			in = dn->d_inode;
		}

		/* null dentry? */
		if (!rinfo->head->is_target) {
			dout("fill_trace null dentry\n");
			if (dn->d_inode) {
				dout("d_delete %p\n", dn);
				d_delete(dn);
			} else {
				dout("d_instantiate %p NULL\n", dn);
				d_instantiate(dn, NULL);
				if (have_lease && d_unhashed(dn))
					d_rehash(dn);
				update_dentry_lease(dn, rinfo->dlease,
						    session,
						    req->r_request_started);
			}
			goto done;
		}

		/* attach proper inode */
		ininfo = rinfo->targeti.in;
		vino.ino = le64_to_cpu(ininfo->ino);
		vino.snap = le64_to_cpu(ininfo->snapid);
		if (!dn->d_inode) {
			in = ceph_get_inode(sb, vino);
			if (IS_ERR(in)) {
				pr_err("fill_trace bad get_inode "
				       "%llx.%llx\n", vino.ino, vino.snap);
				err = PTR_ERR(in);
				d_delete(dn);
				goto done;
			}
			/* splice may hand back a different dentry */
			dn = splice_dentry(dn, in, &have_lease);
			if (IS_ERR(dn)) {
				err = PTR_ERR(dn);
				goto done;
			}
			req->r_dentry = dn;  /* may have spliced */
			ceph_set_dentry_offset(dn);
			igrab(in);
		/*
		 * NOTE(review): 'in' is only assigned in the rename branch
		 * above; if dn->d_inode is set on a non-rename request,
		 * 'in' is still NULL when it is inspected here.  Confirm
		 * whether callers can reach this with a positive dentry.
		 */
		} else if (ceph_ino(in) == vino.ino &&
			   ceph_snap(in) == vino.snap) {
			igrab(in);
		} else {
			dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
			     dn, in, ceph_ino(in), ceph_snap(in),
			     vino.ino, vino.snap);
			have_lease = false;
			in = NULL;
		}

		if (have_lease)
			update_dentry_lease(dn, rinfo->dlease, session,
					    req->r_request_started);
		dout(" final dn %p\n", dn);
		i++;
	} else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
		   req->r_op == CEPH_MDS_OP_MKSNAP) {
		struct dentry *dn = req->r_dentry;

		/* fill out a snapdir LOOKUPSNAP dentry */
		BUG_ON(!dn);
		BUG_ON(!req->r_locked_dir);
		BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
		ininfo = rinfo->targeti.in;
		vino.ino = le64_to_cpu(ininfo->ino);
		vino.snap = le64_to_cpu(ininfo->snapid);
		in = ceph_get_inode(sb, vino);
		if (IS_ERR(in)) {
			pr_err("fill_inode get_inode badness %llx.%llx\n",
			       vino.ino, vino.snap);
			err = PTR_ERR(in);
			d_delete(dn);
			goto done;
		}
		dout(" linking snapped dir %p to dn %p\n", in, dn);
		dn = splice_dentry(dn, in, NULL);
		if (IS_ERR(dn)) {
			err = PTR_ERR(dn);
			goto done;
		}
		ceph_set_dentry_offset(dn);
		req->r_dentry = dn;  /* may have spliced */
		igrab(in);
		rinfo->head->is_dentry = 1;  /* fool notrace handlers */
	}

	/*
	 * Finally fill in the target inode, looking it up if the dentry
	 * handling above did not already leave the matching inode in 'in'.
	 */
	if (rinfo->head->is_target) {
		vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
		vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);

		if (in == NULL || ceph_ino(in) != vino.ino ||
		    ceph_snap(in) != vino.snap) {
			in = ceph_get_inode(sb, vino);
			if (IS_ERR(in)) {
				err = PTR_ERR(in);
				goto done;
			}
		}
		req->r_target_inode = in;

		/* only pass the open mode through when the op succeeded */
		err = fill_inode(in,
				 &rinfo->targeti, NULL,
				 session, req->r_request_started,
				 (le32_to_cpu(rinfo->head->result) == 0) ?
				 req->r_fmode : -1,
				 &req->r_caps_reservation);
		if (err < 0) {
			pr_err("fill_inode badness %p %llx.%llx\n",
			       in, ceph_vinop(in));
			goto done;
		}
	}

done:
	dout("fill_trace done err=%d\n", err);
	return err;
}
1134
1135/*
1136 * Prepopulate our cache with readdir results, leases, etc.
1137 */
1138int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1139 struct ceph_mds_session *session)
1140{
1141 struct dentry *parent = req->r_dentry;
1142 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1143 struct qstr dname;
1144 struct dentry *dn;
1145 struct inode *in;
1146 int err = 0, i;
1147 struct inode *snapdir = NULL;
1148 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
1149 u64 frag = le32_to_cpu(rhead->args.readdir.frag);
1150 struct ceph_dentry_info *di;
1151
1152 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
1153 snapdir = ceph_get_snapdir(parent->d_inode);
1154 parent = d_find_alias(snapdir);
1155 dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
1156 rinfo->dir_nr, parent);
1157 } else {
1158 dout("readdir_prepopulate %d items under dn %p\n",
1159 rinfo->dir_nr, parent);
1160 if (rinfo->dir_dir)
1161 ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
1162 }
1163
1164 for (i = 0; i < rinfo->dir_nr; i++) {
1165 struct ceph_vino vino;
1166
1167 dname.name = rinfo->dir_dname[i];
1168 dname.len = rinfo->dir_dname_len[i];
1169 dname.hash = full_name_hash(dname.name, dname.len);
1170
1171 vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
1172 vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
1173
1174retry_lookup:
1175 dn = d_lookup(parent, &dname);
1176 dout("d_lookup on parent=%p name=%.*s got %p\n",
1177 parent, dname.len, dname.name, dn);
1178
1179 if (!dn) {
1180 dn = d_alloc(parent, &dname);
1181 dout("d_alloc %p '%.*s' = %p\n", parent,
1182 dname.len, dname.name, dn);
1183 if (dn == NULL) {
1184 dout("d_alloc badness\n");
1185 err = -ENOMEM;
1186 goto out;
1187 }
1188 err = ceph_init_dentry(dn);
1189 if (err < 0)
1190 goto out;
1191 } else if (dn->d_inode &&
1192 (ceph_ino(dn->d_inode) != vino.ino ||
1193 ceph_snap(dn->d_inode) != vino.snap)) {
1194 dout(" dn %p points to wrong inode %p\n",
1195 dn, dn->d_inode);
1196 d_delete(dn);
1197 dput(dn);
1198 goto retry_lookup;
1199 } else {
1200 /* reorder parent's d_subdirs */
1201 spin_lock(&dcache_lock);
1202 spin_lock(&dn->d_lock);
1203 list_move(&dn->d_u.d_child, &parent->d_subdirs);
1204 spin_unlock(&dn->d_lock);
1205 spin_unlock(&dcache_lock);
1206 }
1207
1208 di = dn->d_fsdata;
1209 di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
1210
1211 /* inode */
1212 if (dn->d_inode) {
1213 in = dn->d_inode;
1214 } else {
1215 in = ceph_get_inode(parent->d_sb, vino);
1216 if (in == NULL) {
1217 dout("new_inode badness\n");
1218 d_delete(dn);
1219 dput(dn);
1220 err = -ENOMEM;
1221 goto out;
1222 }
1223 dn = splice_dentry(dn, in, NULL);
1224 }
1225
1226 if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
1227 req->r_request_started, -1,
1228 &req->r_caps_reservation) < 0) {
1229 pr_err("fill_inode badness on %p\n", in);
1230 dput(dn);
1231 continue;
1232 }
1233 update_dentry_lease(dn, rinfo->dir_dlease[i],
1234 req->r_session, req->r_request_started);
1235 dput(dn);
1236 }
1237 req->r_did_prepopulate = true;
1238
1239out:
1240 if (snapdir) {
1241 iput(snapdir);
1242 dput(parent);
1243 }
1244 dout("readdir_prepopulate done\n");
1245 return err;
1246}
1247
1248int ceph_inode_set_size(struct inode *inode, loff_t size)
1249{
1250 struct ceph_inode_info *ci = ceph_inode(inode);
1251 int ret = 0;
1252
1253 spin_lock(&inode->i_lock);
1254 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
1255 inode->i_size = size;
1256 inode->i_blocks = (size + (1 << 9) - 1) >> 9;
1257
1258 /* tell the MDS if we are approaching max_size */
1259 if ((size << 1) >= ci->i_max_size &&
1260 (ci->i_reported_size << 1) < ci->i_max_size)
1261 ret = 1;
1262
1263 spin_unlock(&inode->i_lock);
1264 return ret;
1265}
1266
1267/*
1268 * Write back inode data in a worker thread. (This can't be done
1269 * in the message handler context.)
1270 */
1271void ceph_queue_writeback(struct inode *inode)
1272{
1273 if (queue_work(ceph_inode_to_client(inode)->wb_wq,
1274 &ceph_inode(inode)->i_wb_work)) {
1275 dout("ceph_queue_writeback %p\n", inode);
1276 igrab(inode);
1277 } else {
1278 dout("ceph_queue_writeback %p failed\n", inode);
1279 }
1280}
1281
1282static void ceph_writeback_work(struct work_struct *work)
1283{
1284 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1285 i_wb_work);
1286 struct inode *inode = &ci->vfs_inode;
1287
1288 dout("writeback %p\n", inode);
1289 filemap_fdatawrite(&inode->i_data);
1290 iput(inode);
1291}
1292
1293/*
1294 * queue an async invalidation
1295 */
1296void ceph_queue_invalidate(struct inode *inode)
1297{
1298 if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
1299 &ceph_inode(inode)->i_pg_inv_work)) {
1300 dout("ceph_queue_invalidate %p\n", inode);
1301 igrab(inode);
1302 } else {
1303 dout("ceph_queue_invalidate %p failed\n", inode);
1304 }
1305}
1306
1307/*
1308 * invalidate any pages that are not dirty or under writeback. this
1309 * includes pages that are clean and mapped.
1310 */
1311static void ceph_invalidate_nondirty_pages(struct address_space *mapping)
1312{
1313 struct pagevec pvec;
1314 pgoff_t next = 0;
1315 int i;
1316
1317 pagevec_init(&pvec, 0);
1318 while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
1319 for (i = 0; i < pagevec_count(&pvec); i++) {
1320 struct page *page = pvec.pages[i];
1321 pgoff_t index;
1322 int skip_page =
1323 (PageDirty(page) || PageWriteback(page));
1324
1325 if (!skip_page)
1326 skip_page = !trylock_page(page);
1327
1328 /*
1329 * We really shouldn't be looking at the ->index of an
1330 * unlocked page. But we're not allowed to lock these
1331 * pages. So we rely upon nobody altering the ->index
1332 * of this (pinned-by-us) page.
1333 */
1334 index = page->index;
1335 if (index > next)
1336 next = index;
1337 next++;
1338
1339 if (skip_page)
1340 continue;
1341
1342 generic_error_remove_page(mapping, page);
1343 unlock_page(page);
1344 }
1345 pagevec_release(&pvec);
1346 cond_resched();
1347 }
1348}
1349
1350/*
1351 * Invalidate inode pages in a worker thread. (This can't be done
1352 * in the message handler context.)
1353 */
1354static void ceph_invalidate_work(struct work_struct *work)
1355{
1356 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1357 i_pg_inv_work);
1358 struct inode *inode = &ci->vfs_inode;
1359 u32 orig_gen;
1360 int check = 0;
1361
1362 spin_lock(&inode->i_lock);
1363 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1364 ci->i_rdcache_gen, ci->i_rdcache_revoking);
1365 if (ci->i_rdcache_gen == 0 ||
1366 ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1367 BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
1368 /* nevermind! */
1369 ci->i_rdcache_revoking = 0;
1370 spin_unlock(&inode->i_lock);
1371 goto out;
1372 }
1373 orig_gen = ci->i_rdcache_gen;
1374 spin_unlock(&inode->i_lock);
1375
1376 ceph_invalidate_nondirty_pages(inode->i_mapping);
1377
1378 spin_lock(&inode->i_lock);
1379 if (orig_gen == ci->i_rdcache_gen) {
1380 dout("invalidate_pages %p gen %d successful\n", inode,
1381 ci->i_rdcache_gen);
1382 ci->i_rdcache_gen = 0;
1383 ci->i_rdcache_revoking = 0;
1384 check = 1;
1385 } else {
1386 dout("invalidate_pages %p gen %d raced, gen now %d\n",
1387 inode, orig_gen, ci->i_rdcache_gen);
1388 }
1389 spin_unlock(&inode->i_lock);
1390
1391 if (check)
1392 ceph_check_caps(ci, 0, NULL);
1393out:
1394 iput(inode);
1395}
1396
1397
1398/*
1399 * called by trunc_wq; take i_mutex ourselves
1400 *
1401 * We also truncate in a separate thread as well.
1402 */
1403static void ceph_vmtruncate_work(struct work_struct *work)
1404{
1405 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1406 i_vmtruncate_work);
1407 struct inode *inode = &ci->vfs_inode;
1408
1409 dout("vmtruncate_work %p\n", inode);
1410 mutex_lock(&inode->i_mutex);
1411 __ceph_do_pending_vmtruncate(inode);
1412 mutex_unlock(&inode->i_mutex);
1413 iput(inode);
1414}
1415
1416/*
1417 * Queue an async vmtruncate. If we fail to queue work, we will handle
1418 * the truncation the next time we call __ceph_do_pending_vmtruncate.
1419 */
1420void ceph_queue_vmtruncate(struct inode *inode)
1421{
1422 struct ceph_inode_info *ci = ceph_inode(inode);
1423
1424 if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
1425 &ci->i_vmtruncate_work)) {
1426 dout("ceph_queue_vmtruncate %p\n", inode);
1427 igrab(inode);
1428 } else {
1429 dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
1430 inode, ci->i_truncate_pending);
1431 }
1432}
1433
1434/*
1435 * called with i_mutex held.
1436 *
1437 * Make sure any pending truncation is applied before doing anything
1438 * that may depend on it.
1439 */
1440void __ceph_do_pending_vmtruncate(struct inode *inode)
1441{
1442 struct ceph_inode_info *ci = ceph_inode(inode);
1443 u64 to;
1444 int wrbuffer_refs, wake = 0;
1445
1446retry:
1447 spin_lock(&inode->i_lock);
1448 if (ci->i_truncate_pending == 0) {
1449 dout("__do_pending_vmtruncate %p none pending\n", inode);
1450 spin_unlock(&inode->i_lock);
1451 return;
1452 }
1453
1454 /*
1455 * make sure any dirty snapped pages are flushed before we
1456 * possibly truncate them.. so write AND block!
1457 */
1458 if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
1459 dout("__do_pending_vmtruncate %p flushing snaps first\n",
1460 inode);
1461 spin_unlock(&inode->i_lock);
1462 filemap_write_and_wait_range(&inode->i_data, 0,
1463 inode->i_sb->s_maxbytes);
1464 goto retry;
1465 }
1466
1467 to = ci->i_truncate_size;
1468 wrbuffer_refs = ci->i_wrbuffer_ref;
1469 dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
1470 ci->i_truncate_pending, to);
1471 spin_unlock(&inode->i_lock);
1472
1473 truncate_inode_pages(inode->i_mapping, to);
1474
1475 spin_lock(&inode->i_lock);
1476 ci->i_truncate_pending--;
1477 if (ci->i_truncate_pending == 0)
1478 wake = 1;
1479 spin_unlock(&inode->i_lock);
1480
1481 if (wrbuffer_refs == 0)
1482 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
1483 if (wake)
1484 wake_up(&ci->i_cap_wq);
1485}
1486
1487
1488/*
1489 * symlinks
1490 */
1491static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
1492{
1493 struct ceph_inode_info *ci = ceph_inode(dentry->d_inode);
1494 nd_set_link(nd, ci->i_symlink);
1495 return NULL;
1496}
1497
/*
 * Inode operations for symlinks: readlink uses the generic helper,
 * follow_link hands back the target string held in the ceph inode.
 */
static const struct inode_operations ceph_symlink_iops = {
	.readlink = generic_readlink,
	.follow_link = ceph_sym_follow_link,
};
1502
/*
 * setattr
 *
 * Apply the attribute changes in @attr to @dentry's inode.  For each
 * attribute, depending on which caps are currently issued, the change
 * is either applied locally (and the corresponding cap bits collected
 * in 'dirtied') or encoded into a SETATTR request for the MDS (request
 * bits collected in 'mask', caps to give up collected in 'release').
 * The request is only sent if 'mask' ends up non-zero.
 *
 * Returns -EROFS for snapshot inodes, the inode_change_ok() error,
 * -EINVAL if the new size exceeds s_maxbytes, the request-creation
 * error, or the result of the MDS request (0 if none was needed).
 */
int ceph_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct inode *parent_inode = dentry->d_parent->d_inode;
	const unsigned int ia_valid = attr->ia_valid;
	struct ceph_mds_request *req;
	struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc;
	int issued;
	int release = 0, dirtied = 0;	/* caps to drop / caps dirtied locally */
	int mask = 0;			/* CEPH_SETATTR_* bits to send to the MDS */
	int err = 0;

	if (ceph_snap(inode) != CEPH_NOSNAP)
		return -EROFS;

	__ceph_do_pending_vmtruncate(inode);

	err = inode_change_ok(inode, attr);
	if (err != 0)
		return err;

	/* allocated up front; released on every exit path below */
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
				       USE_AUTH_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);

	spin_lock(&inode->i_lock);
	issued = __ceph_caps_issued(ci, NULL);
	dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));

	/*
	 * uid/gid/mode: with AUTH_EXCL update locally; otherwise ask the
	 * MDS unless AUTH_SHARED is held and the value is unchanged.
	 */
	if (ia_valid & ATTR_UID) {
		dout("setattr %p uid %d -> %d\n", inode,
		     inode->i_uid, attr->ia_uid);
		if (issued & CEPH_CAP_AUTH_EXCL) {
			inode->i_uid = attr->ia_uid;
			dirtied |= CEPH_CAP_AUTH_EXCL;
		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
			   attr->ia_uid != inode->i_uid) {
			req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
			mask |= CEPH_SETATTR_UID;
			release |= CEPH_CAP_AUTH_SHARED;
		}
	}
	if (ia_valid & ATTR_GID) {
		dout("setattr %p gid %d -> %d\n", inode,
		     inode->i_gid, attr->ia_gid);
		if (issued & CEPH_CAP_AUTH_EXCL) {
			inode->i_gid = attr->ia_gid;
			dirtied |= CEPH_CAP_AUTH_EXCL;
		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
			   attr->ia_gid != inode->i_gid) {
			req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
			mask |= CEPH_SETATTR_GID;
			release |= CEPH_CAP_AUTH_SHARED;
		}
	}
	if (ia_valid & ATTR_MODE) {
		dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
		     attr->ia_mode);
		if (issued & CEPH_CAP_AUTH_EXCL) {
			inode->i_mode = attr->ia_mode;
			dirtied |= CEPH_CAP_AUTH_EXCL;
		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
			   attr->ia_mode != inode->i_mode) {
			req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
			mask |= CEPH_SETATTR_MODE;
			release |= CEPH_CAP_AUTH_SHARED;
		}
	}

	/*
	 * atime/mtime: with FILE_EXCL set locally (bumping
	 * i_time_warp_seq); with FILE_WR only move the time forward
	 * locally; otherwise ask the MDS unless FILE_SHARED is held and
	 * the value is unchanged.
	 */
	if (ia_valid & ATTR_ATIME) {
		dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode,
		     inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
		     attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
		if (issued & CEPH_CAP_FILE_EXCL) {
			ci->i_time_warp_seq++;
			inode->i_atime = attr->ia_atime;
			dirtied |= CEPH_CAP_FILE_EXCL;
		} else if ((issued & CEPH_CAP_FILE_WR) &&
			   timespec_compare(&inode->i_atime,
					    &attr->ia_atime) < 0) {
			inode->i_atime = attr->ia_atime;
			dirtied |= CEPH_CAP_FILE_WR;
		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
			   !timespec_equal(&inode->i_atime, &attr->ia_atime)) {
			ceph_encode_timespec(&req->r_args.setattr.atime,
					     &attr->ia_atime);
			mask |= CEPH_SETATTR_ATIME;
			/*
			 * NOTE(review): this releases FILE_CACHE whereas
			 * the mtime and size cases release FILE_SHARED --
			 * confirm the difference is intentional.
			 */
			release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
				CEPH_CAP_FILE_WR;
		}
	}
	if (ia_valid & ATTR_MTIME) {
		dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode,
		     inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
		     attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
		if (issued & CEPH_CAP_FILE_EXCL) {
			ci->i_time_warp_seq++;
			inode->i_mtime = attr->ia_mtime;
			dirtied |= CEPH_CAP_FILE_EXCL;
		} else if ((issued & CEPH_CAP_FILE_WR) &&
			   timespec_compare(&inode->i_mtime,
					    &attr->ia_mtime) < 0) {
			inode->i_mtime = attr->ia_mtime;
			dirtied |= CEPH_CAP_FILE_WR;
		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
			   !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) {
			ceph_encode_timespec(&req->r_args.setattr.mtime,
					     &attr->ia_mtime);
			mask |= CEPH_SETATTR_MTIME;
			release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
				CEPH_CAP_FILE_WR;
		}
	}
	/*
	 * size: only a size *increase* under FILE_EXCL is applied
	 * locally; any other change goes to the MDS.
	 */
	if (ia_valid & ATTR_SIZE) {
		dout("setattr %p size %lld -> %lld\n", inode,
		     inode->i_size, attr->ia_size);
		if (attr->ia_size > inode->i_sb->s_maxbytes) {
			err = -EINVAL;
			goto out;	/* i_lock is still held here */
		}
		if ((issued & CEPH_CAP_FILE_EXCL) &&
		    attr->ia_size > inode->i_size) {
			inode->i_size = attr->ia_size;
			inode->i_blocks =
				(attr->ia_size + (1 << 9) - 1) >> 9;
			inode->i_ctime = attr->ia_ctime;
			ci->i_reported_size = attr->ia_size;
			dirtied |= CEPH_CAP_FILE_EXCL;
		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
			   attr->ia_size != inode->i_size) {
			req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
			req->r_args.setattr.old_size =
				cpu_to_le64(inode->i_size);
			mask |= CEPH_SETATTR_SIZE;
			release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
				CEPH_CAP_FILE_WR;
		}
	}

	/* these do nothing */
	if (ia_valid & ATTR_CTIME) {
		/* true when ctime is the only attribute being changed */
		bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
					 ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
		dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode,
		     inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
		     attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
		     only ? "ctime only" : "ignored");
		inode->i_ctime = attr->ia_ctime;
		if (only) {
			/*
			 * if kernel wants to dirty ctime but nothing else,
			 * we need to choose a cap to dirty under, or do
			 * a almost-no-op setattr
			 */
			if (issued & CEPH_CAP_AUTH_EXCL)
				dirtied |= CEPH_CAP_AUTH_EXCL;
			else if (issued & CEPH_CAP_FILE_EXCL)
				dirtied |= CEPH_CAP_FILE_EXCL;
			else if (issued & CEPH_CAP_XATTR_EXCL)
				dirtied |= CEPH_CAP_XATTR_EXCL;
			else
				mask |= CEPH_SETATTR_CTIME;
		}
	}
	if (ia_valid & ATTR_FILE)
		dout("setattr %p ATTR_FILE ... hrm!\n", inode);

	/* anything changed locally: mark caps dirty and stamp ctime */
	if (dirtied) {
		__ceph_mark_dirty_caps(ci, dirtied);
		inode->i_ctime = CURRENT_TIME;
	}

	/* we can only release caps we actually hold */
	release &= issued;
	spin_unlock(&inode->i_lock);

	if (mask) {
		req->r_inode = igrab(inode);
		req->r_inode_drop = release;
		req->r_args.setattr.mask = cpu_to_le32(mask);
		req->r_num_caps = 1;
		err = ceph_mdsc_do_request(mdsc, parent_inode, req);
	}
	dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
	     ceph_cap_string(dirtied), mask);

	ceph_mdsc_put_request(req);
	__ceph_do_pending_vmtruncate(inode);
	return err;
out:
	/* error exit taken with i_lock held */
	spin_unlock(&inode->i_lock);
	ceph_mdsc_put_request(req);
	return err;
}
1701
1702/*
1703 * Verify that we have a lease on the given mask. If not,
1704 * do a getattr against an mds.
1705 */
1706int ceph_do_getattr(struct inode *inode, int mask)
1707{
1708 struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
1709 struct ceph_mds_client *mdsc = &client->mdsc;
1710 struct ceph_mds_request *req;
1711 int err;
1712
1713 if (ceph_snap(inode) == CEPH_SNAPDIR) {
1714 dout("do_getattr inode %p SNAPDIR\n", inode);
1715 return 0;
1716 }
1717
1718 dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
1719 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
1720 return 0;
1721
1722 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
1723 if (IS_ERR(req))
1724 return PTR_ERR(req);
1725 req->r_inode = igrab(inode);
1726 req->r_num_caps = 1;
1727 req->r_args.getattr.mask = cpu_to_le32(mask);
1728 err = ceph_mdsc_do_request(mdsc, NULL, req);
1729 ceph_mdsc_put_request(req);
1730 dout("do_getattr result=%d\n", err);
1731 return err;
1732}
1733
1734
1735/*
1736 * Check inode permissions. We verify we have a valid value for
1737 * the AUTH cap, then call the generic handler.
1738 */
1739int ceph_permission(struct inode *inode, int mask)
1740{
1741 int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
1742
1743 if (!err)
1744 err = generic_permission(inode, mask, NULL);
1745 return err;
1746}
1747
1748/*
1749 * Get all attributes. Hopefully somedata we'll have a statlite()
1750 * and can limit the fields we require to be accurate.
1751 */
1752int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
1753 struct kstat *stat)
1754{
1755 struct inode *inode = dentry->d_inode;
1756 struct ceph_inode_info *ci = ceph_inode(inode);
1757 int err;
1758
1759 err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
1760 if (!err) {
1761 generic_fillattr(inode, stat);
1762 stat->ino = inode->i_ino;
1763 if (ceph_snap(inode) != CEPH_NOSNAP)
1764 stat->dev = ceph_snap(inode);
1765 else
1766 stat->dev = 0;
1767 if (S_ISDIR(inode->i_mode)) {
1768 stat->size = ci->i_rbytes;
1769 stat->blocks = 0;
1770 stat->blksize = 65536;
1771 }
1772 }
1773 return err;
1774}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
new file mode 100644
index 000000000000..8a5bcae62846
--- /dev/null
+++ b/fs/ceph/ioctl.c
@@ -0,0 +1,160 @@
1#include <linux/in.h>
2
3#include "ioctl.h"
4#include "super.h"
5#include "ceph_debug.h"
6
7
8/*
9 * ioctls
10 */
11
12/*
13 * get and set the file layout
14 */
15static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
16{
17 struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
18 struct ceph_ioctl_layout l;
19 int err;
20
21 err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
22 if (!err) {
23 l.stripe_unit = ceph_file_layout_su(ci->i_layout);
24 l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
25 l.object_size = ceph_file_layout_object_size(ci->i_layout);
26 l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
27 l.preferred_osd =
28 (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
29 if (copy_to_user(arg, &l, sizeof(l)))
30 return -EFAULT;
31 }
32
33 return err;
34}
35
36static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
37{
38 struct inode *inode = file->f_dentry->d_inode;
39 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
40 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
41 struct ceph_mds_request *req;
42 struct ceph_ioctl_layout l;
43 int err, i;
44
45 /* copy and validate */
46 if (copy_from_user(&l, arg, sizeof(l)))
47 return -EFAULT;
48
49 if ((l.object_size & ~PAGE_MASK) ||
50 (l.stripe_unit & ~PAGE_MASK) ||
51 !l.stripe_unit ||
52 (l.object_size &&
53 (unsigned)l.object_size % (unsigned)l.stripe_unit))
54 return -EINVAL;
55
56 /* make sure it's a valid data pool */
57 if (l.data_pool > 0) {
58 mutex_lock(&mdsc->mutex);
59 err = -EINVAL;
60 for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
61 if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
62 err = 0;
63 break;
64 }
65 mutex_unlock(&mdsc->mutex);
66 if (err)
67 return err;
68 }
69
70 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
71 USE_AUTH_MDS);
72 if (IS_ERR(req))
73 return PTR_ERR(req);
74 req->r_inode = igrab(inode);
75 req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
76
77 req->r_args.setlayout.layout.fl_stripe_unit =
78 cpu_to_le32(l.stripe_unit);
79 req->r_args.setlayout.layout.fl_stripe_count =
80 cpu_to_le32(l.stripe_count);
81 req->r_args.setlayout.layout.fl_object_size =
82 cpu_to_le32(l.object_size);
83 req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
84 req->r_args.setlayout.layout.fl_pg_preferred =
85 cpu_to_le32(l.preferred_osd);
86
87 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
88 ceph_mdsc_put_request(req);
89 return err;
90}
91
92/*
93 * Return object name, size/offset information, and location (OSD
94 * number, network address) for a given file offset.
95 */
96static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
97{
98 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
102 u64 len = 1, olen;
103 u64 tmp;
104 struct ceph_object_layout ol;
105 struct ceph_pg pgid;
106
107 /* copy and validate */
108 if (copy_from_user(&dl, arg, sizeof(dl)))
109 return -EFAULT;
110
111 down_read(&osdc->map_sem);
112 ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
113 &dl.object_no, &dl.object_offset, &olen);
114 dl.file_offset -= dl.object_offset;
115 dl.object_size = ceph_file_layout_object_size(ci->i_layout);
116 dl.block_size = ceph_file_layout_su(ci->i_layout);
117
118 /* block_offset = object_offset % block_size */
119 tmp = dl.object_offset;
120 dl.block_offset = do_div(tmp, dl.block_size);
121
122 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
123 ceph_ino(inode), dl.object_no);
124 ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
125 osdc->osdmap);
126
127 pgid = ol.ol_pgid;
128 dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
129 if (dl.osd >= 0) {
130 struct ceph_entity_addr *a =
131 ceph_osd_addr(osdc->osdmap, dl.osd);
132 if (a)
133 memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr));
134 } else {
135 memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
136 }
137 up_read(&osdc->map_sem);
138
139 /* send result back to user */
140 if (copy_to_user(arg, &dl, sizeof(dl)))
141 return -EFAULT;
142
143 return 0;
144}
145
146long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
147{
148 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
149 switch (cmd) {
150 case CEPH_IOC_GET_LAYOUT:
151 return ceph_ioctl_get_layout(file, (void __user *)arg);
152
153 case CEPH_IOC_SET_LAYOUT:
154 return ceph_ioctl_set_layout(file, (void __user *)arg);
155
156 case CEPH_IOC_GET_DATALOC:
157 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
158 }
159 return -ENOTTY;
160}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
new file mode 100644
index 000000000000..25e4f1a9d059
--- /dev/null
+++ b/fs/ceph/ioctl.h
@@ -0,0 +1,40 @@
#ifndef FS_CEPH_IOCTL_H
#define FS_CEPH_IOCTL_H

#include <linux/ioctl.h>
#include <linux/types.h>

/* ioctl "type" byte shared by all ceph ioctl commands below */
#define CEPH_IOCTL_MAGIC 0x97

/*
 * File layout as exchanged with user space by the GET_LAYOUT and
 * SET_LAYOUT ioctls.
 */
/* just use u64 to align sanely on all archs */
struct ceph_ioctl_layout {
	__u64 stripe_unit, stripe_count, object_size;
	__u64 data_pool;
	__s64 preferred_osd;
};

/* read the file's layout into a struct ceph_ioctl_layout */
#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1,		\
				   struct ceph_ioctl_layout)
/* set the file's layout from a struct ceph_ioctl_layout */
#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2,		\
				   struct ceph_ioctl_layout)

/*
 * Extract identity, address of the OSD and object storing a given
 * file offset.
 */
struct ceph_ioctl_dataloc {
	__u64 file_offset;           /* in+out: file offset */
	__u64 object_offset;         /* out: offset in object */
	__u64 object_no;             /* out: object # */
	__u64 object_size;           /* out: object size */
	char object_name[64];        /* out: object name */
	__u64 block_offset;          /* out: offset in block */
	__u64 block_size;            /* out: block length */
	__s64 osd;                   /* out: osd # */
	struct sockaddr_storage osd_addr; /* out: osd address */
};

/* look up the data location for dataloc.file_offset (in and out) */
#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3,	\
				   struct ceph_ioctl_dataloc)

#endif
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
new file mode 100644
index 000000000000..60a9a4ae47be
--- /dev/null
+++ b/fs/ceph/mds_client.c
@@ -0,0 +1,3043 @@
1#include "ceph_debug.h"
2
3#include <linux/wait.h>
4#include <linux/slab.h>
5#include <linux/sched.h>
6
7#include "mds_client.h"
8#include "mon_client.h"
9#include "super.h"
10#include "messenger.h"
11#include "decode.h"
12#include "auth.h"
13#include "pagelist.h"
14
15/*
16 * A cluster of MDS (metadata server) daemons is responsible for
17 * managing the file system namespace (the directory hierarchy and
18 * inodes) and for coordinating shared access to storage. Metadata is
19 * partitioned hierarchically across a number of servers, and that
20 * partition varies over time as the cluster adjusts the distribution
21 * in order to balance load.
22 *
23 * The MDS client is primarily responsible for managing synchronous
24 * metadata requests for operations like open, unlink, and so forth.
25 * If there is a MDS failure, we find out about it when we (possibly
26 * request and) receive a new MDS map, and can resubmit affected
27 * requests.
28 *
29 * For the most part, though, we take advantage of a lossless
30 * communications channel to the MDS, and do not need to worry about
31 * timing out or resubmitting requests.
32 *
33 * We maintain a stateful "session" with each MDS we interact with.
34 * Within each session, we send periodic heartbeat messages to ensure
35 * any capabilities or leases we have been issued remain valid. If
36 * the session times out and goes stale, our leases and capabilities
37 * are no longer valid.
38 */
39
40static void __wake_requests(struct ceph_mds_client *mdsc,
41 struct list_head *head);
42
43const static struct ceph_connection_operations mds_con_ops;
44
45
46/*
47 * mds reply parsing
48 */
49
50/*
51 * parse individual inode info
52 */
53static int parse_reply_info_in(void **p, void *end,
54 struct ceph_mds_reply_info_in *info)
55{
56 int err = -EIO;
57
58 info->in = *p;
59 *p += sizeof(struct ceph_mds_reply_inode) +
60 sizeof(*info->in->fragtree.splits) *
61 le32_to_cpu(info->in->fragtree.nsplits);
62
63 ceph_decode_32_safe(p, end, info->symlink_len, bad);
64 ceph_decode_need(p, end, info->symlink_len, bad);
65 info->symlink = *p;
66 *p += info->symlink_len;
67
68 ceph_decode_32_safe(p, end, info->xattr_len, bad);
69 ceph_decode_need(p, end, info->xattr_len, bad);
70 info->xattr_data = *p;
71 *p += info->xattr_len;
72 return 0;
73bad:
74 return err;
75}
76
77/*
78 * parse a normal reply, which may contain a (dir+)dentry and/or a
79 * target inode.
80 */
81static int parse_reply_info_trace(void **p, void *end,
82 struct ceph_mds_reply_info_parsed *info)
83{
84 int err;
85
86 if (info->head->is_dentry) {
87 err = parse_reply_info_in(p, end, &info->diri);
88 if (err < 0)
89 goto out_bad;
90
91 if (unlikely(*p + sizeof(*info->dirfrag) > end))
92 goto bad;
93 info->dirfrag = *p;
94 *p += sizeof(*info->dirfrag) +
95 sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
96 if (unlikely(*p > end))
97 goto bad;
98
99 ceph_decode_32_safe(p, end, info->dname_len, bad);
100 ceph_decode_need(p, end, info->dname_len, bad);
101 info->dname = *p;
102 *p += info->dname_len;
103 info->dlease = *p;
104 *p += sizeof(*info->dlease);
105 }
106
107 if (info->head->is_target) {
108 err = parse_reply_info_in(p, end, &info->targeti);
109 if (err < 0)
110 goto out_bad;
111 }
112
113 if (unlikely(*p != end))
114 goto bad;
115 return 0;
116
117bad:
118 err = -EIO;
119out_bad:
120 pr_err("problem parsing mds trace %d\n", err);
121 return err;
122}
123
124/*
125 * parse readdir results
126 */
127static int parse_reply_info_dir(void **p, void *end,
128 struct ceph_mds_reply_info_parsed *info)
129{
130 u32 num, i = 0;
131 int err;
132
133 info->dir_dir = *p;
134 if (*p + sizeof(*info->dir_dir) > end)
135 goto bad;
136 *p += sizeof(*info->dir_dir) +
137 sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
138 if (*p > end)
139 goto bad;
140
141 ceph_decode_need(p, end, sizeof(num) + 2, bad);
142 num = ceph_decode_32(p);
143 info->dir_end = ceph_decode_8(p);
144 info->dir_complete = ceph_decode_8(p);
145 if (num == 0)
146 goto done;
147
148 /* alloc large array */
149 info->dir_nr = num;
150 info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
151 sizeof(*info->dir_dname) +
152 sizeof(*info->dir_dname_len) +
153 sizeof(*info->dir_dlease),
154 GFP_NOFS);
155 if (info->dir_in == NULL) {
156 err = -ENOMEM;
157 goto out_bad;
158 }
159 info->dir_dname = (void *)(info->dir_in + num);
160 info->dir_dname_len = (void *)(info->dir_dname + num);
161 info->dir_dlease = (void *)(info->dir_dname_len + num);
162
163 while (num) {
164 /* dentry */
165 ceph_decode_need(p, end, sizeof(u32)*2, bad);
166 info->dir_dname_len[i] = ceph_decode_32(p);
167 ceph_decode_need(p, end, info->dir_dname_len[i], bad);
168 info->dir_dname[i] = *p;
169 *p += info->dir_dname_len[i];
170 dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
171 info->dir_dname[i]);
172 info->dir_dlease[i] = *p;
173 *p += sizeof(struct ceph_mds_reply_lease);
174
175 /* inode */
176 err = parse_reply_info_in(p, end, &info->dir_in[i]);
177 if (err < 0)
178 goto out_bad;
179 i++;
180 num--;
181 }
182
183done:
184 if (*p != end)
185 goto bad;
186 return 0;
187
188bad:
189 err = -EIO;
190out_bad:
191 pr_err("problem parsing dir contents %d\n", err);
192 return err;
193}
194
195/*
196 * parse entire mds reply
197 */
198static int parse_reply_info(struct ceph_msg *msg,
199 struct ceph_mds_reply_info_parsed *info)
200{
201 void *p, *end;
202 u32 len;
203 int err;
204
205 info->head = msg->front.iov_base;
206 p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
207 end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
208
209 /* trace */
210 ceph_decode_32_safe(&p, end, len, bad);
211 if (len > 0) {
212 err = parse_reply_info_trace(&p, p+len, info);
213 if (err < 0)
214 goto out_bad;
215 }
216
217 /* dir content */
218 ceph_decode_32_safe(&p, end, len, bad);
219 if (len > 0) {
220 err = parse_reply_info_dir(&p, p+len, info);
221 if (err < 0)
222 goto out_bad;
223 }
224
225 /* snap blob */
226 ceph_decode_32_safe(&p, end, len, bad);
227 info->snapblob_len = len;
228 info->snapblob = p;
229 p += len;
230
231 if (p != end)
232 goto bad;
233 return 0;
234
235bad:
236 err = -EIO;
237out_bad:
238 pr_err("mds parse_reply err %d\n", err);
239 return err;
240}
241
242static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
243{
244 kfree(info->dir_in);
245}
246
247
248/*
249 * sessions
250 */
251static const char *session_state_name(int s)
252{
253 switch (s) {
254 case CEPH_MDS_SESSION_NEW: return "new";
255 case CEPH_MDS_SESSION_OPENING: return "opening";
256 case CEPH_MDS_SESSION_OPEN: return "open";
257 case CEPH_MDS_SESSION_HUNG: return "hung";
258 case CEPH_MDS_SESSION_CLOSING: return "closing";
259 case CEPH_MDS_SESSION_RESTARTING: return "restarting";
260 case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
261 default: return "???";
262 }
263}
264
265static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
266{
267 if (atomic_inc_not_zero(&s->s_ref)) {
268 dout("mdsc get_session %p %d -> %d\n", s,
269 atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
270 return s;
271 } else {
272 dout("mdsc get_session %p 0 -- FAIL", s);
273 return NULL;
274 }
275}
276
277void ceph_put_mds_session(struct ceph_mds_session *s)
278{
279 dout("mdsc put_session %p %d -> %d\n", s,
280 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
281 if (atomic_dec_and_test(&s->s_ref)) {
282 if (s->s_authorizer)
283 s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
284 s->s_mdsc->client->monc.auth, s->s_authorizer);
285 kfree(s);
286 }
287}
288
289/*
290 * called under mdsc->mutex
291 */
292struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
293 int mds)
294{
295 struct ceph_mds_session *session;
296
297 if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
298 return NULL;
299 session = mdsc->sessions[mds];
300 dout("lookup_mds_session %p %d\n", session,
301 atomic_read(&session->s_ref));
302 get_session(session);
303 return session;
304}
305
306static bool __have_session(struct ceph_mds_client *mdsc, int mds)
307{
308 if (mds >= mdsc->max_sessions)
309 return false;
310 return mdsc->sessions[mds];
311}
312
313static int __verify_registered_session(struct ceph_mds_client *mdsc,
314 struct ceph_mds_session *s)
315{
316 if (s->s_mds >= mdsc->max_sessions ||
317 mdsc->sessions[s->s_mds] != s)
318 return -ENOENT;
319 return 0;
320}
321
322/*
323 * create+register a new session for given mds.
324 * called under mdsc->mutex.
325 */
326static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
327 int mds)
328{
329 struct ceph_mds_session *s;
330
331 s = kzalloc(sizeof(*s), GFP_NOFS);
332 if (!s)
333 return ERR_PTR(-ENOMEM);
334 s->s_mdsc = mdsc;
335 s->s_mds = mds;
336 s->s_state = CEPH_MDS_SESSION_NEW;
337 s->s_ttl = 0;
338 s->s_seq = 0;
339 mutex_init(&s->s_mutex);
340
341 ceph_con_init(mdsc->client->msgr, &s->s_con);
342 s->s_con.private = s;
343 s->s_con.ops = &mds_con_ops;
344 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
345 s->s_con.peer_name.num = cpu_to_le64(mds);
346
347 spin_lock_init(&s->s_cap_lock);
348 s->s_cap_gen = 0;
349 s->s_cap_ttl = 0;
350 s->s_renew_requested = 0;
351 s->s_renew_seq = 0;
352 INIT_LIST_HEAD(&s->s_caps);
353 s->s_nr_caps = 0;
354 s->s_trim_caps = 0;
355 atomic_set(&s->s_ref, 1);
356 INIT_LIST_HEAD(&s->s_waiting);
357 INIT_LIST_HEAD(&s->s_unsafe);
358 s->s_num_cap_releases = 0;
359 s->s_cap_iterator = NULL;
360 INIT_LIST_HEAD(&s->s_cap_releases);
361 INIT_LIST_HEAD(&s->s_cap_releases_done);
362 INIT_LIST_HEAD(&s->s_cap_flushing);
363 INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
364
365 dout("register_session mds%d\n", mds);
366 if (mds >= mdsc->max_sessions) {
367 int newmax = 1 << get_count_order(mds+1);
368 struct ceph_mds_session **sa;
369
370 dout("register_session realloc to %d\n", newmax);
371 sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
372 if (sa == NULL)
373 goto fail_realloc;
374 if (mdsc->sessions) {
375 memcpy(sa, mdsc->sessions,
376 mdsc->max_sessions * sizeof(void *));
377 kfree(mdsc->sessions);
378 }
379 mdsc->sessions = sa;
380 mdsc->max_sessions = newmax;
381 }
382 mdsc->sessions[mds] = s;
383 atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
384
385 ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
386
387 return s;
388
389fail_realloc:
390 kfree(s);
391 return ERR_PTR(-ENOMEM);
392}
393
394/*
395 * called under mdsc->mutex
396 */
397static void __unregister_session(struct ceph_mds_client *mdsc,
398 struct ceph_mds_session *s)
399{
400 dout("__unregister_session mds%d %p\n", s->s_mds, s);
401 BUG_ON(mdsc->sessions[s->s_mds] != s);
402 mdsc->sessions[s->s_mds] = NULL;
403 ceph_con_close(&s->s_con);
404 ceph_put_mds_session(s);
405}
406
407/*
408 * drop session refs in request.
409 *
410 * should be last request ref, or hold mdsc->mutex
411 */
412static void put_request_session(struct ceph_mds_request *req)
413{
414 if (req->r_session) {
415 ceph_put_mds_session(req->r_session);
416 req->r_session = NULL;
417 }
418}
419
420void ceph_mdsc_release_request(struct kref *kref)
421{
422 struct ceph_mds_request *req = container_of(kref,
423 struct ceph_mds_request,
424 r_kref);
425 if (req->r_request)
426 ceph_msg_put(req->r_request);
427 if (req->r_reply) {
428 ceph_msg_put(req->r_reply);
429 destroy_reply_info(&req->r_reply_info);
430 }
431 if (req->r_inode) {
432 ceph_put_cap_refs(ceph_inode(req->r_inode),
433 CEPH_CAP_PIN);
434 iput(req->r_inode);
435 }
436 if (req->r_locked_dir)
437 ceph_put_cap_refs(ceph_inode(req->r_locked_dir),
438 CEPH_CAP_PIN);
439 if (req->r_target_inode)
440 iput(req->r_target_inode);
441 if (req->r_dentry)
442 dput(req->r_dentry);
443 if (req->r_old_dentry) {
444 ceph_put_cap_refs(
445 ceph_inode(req->r_old_dentry->d_parent->d_inode),
446 CEPH_CAP_PIN);
447 dput(req->r_old_dentry);
448 }
449 kfree(req->r_path1);
450 kfree(req->r_path2);
451 put_request_session(req);
452 ceph_unreserve_caps(&req->r_caps_reservation);
453 kfree(req);
454}
455
456/*
457 * lookup session, bump ref if found.
458 *
459 * called under mdsc->mutex.
460 */
461static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
462 u64 tid)
463{
464 struct ceph_mds_request *req;
465 struct rb_node *n = mdsc->request_tree.rb_node;
466
467 while (n) {
468 req = rb_entry(n, struct ceph_mds_request, r_node);
469 if (tid < req->r_tid)
470 n = n->rb_left;
471 else if (tid > req->r_tid)
472 n = n->rb_right;
473 else {
474 ceph_mdsc_get_request(req);
475 return req;
476 }
477 }
478 return NULL;
479}
480
481static void __insert_request(struct ceph_mds_client *mdsc,
482 struct ceph_mds_request *new)
483{
484 struct rb_node **p = &mdsc->request_tree.rb_node;
485 struct rb_node *parent = NULL;
486 struct ceph_mds_request *req = NULL;
487
488 while (*p) {
489 parent = *p;
490 req = rb_entry(parent, struct ceph_mds_request, r_node);
491 if (new->r_tid < req->r_tid)
492 p = &(*p)->rb_left;
493 else if (new->r_tid > req->r_tid)
494 p = &(*p)->rb_right;
495 else
496 BUG();
497 }
498
499 rb_link_node(&new->r_node, parent, p);
500 rb_insert_color(&new->r_node, &mdsc->request_tree);
501}
502
503/*
504 * Register an in-flight request, and assign a tid. Link to directory
505 * are modifying (if any).
506 *
507 * Called under mdsc->mutex.
508 */
509static void __register_request(struct ceph_mds_client *mdsc,
510 struct ceph_mds_request *req,
511 struct inode *dir)
512{
513 req->r_tid = ++mdsc->last_tid;
514 if (req->r_num_caps)
515 ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps);
516 dout("__register_request %p tid %lld\n", req, req->r_tid);
517 ceph_mdsc_get_request(req);
518 __insert_request(mdsc, req);
519
520 if (dir) {
521 struct ceph_inode_info *ci = ceph_inode(dir);
522
523 spin_lock(&ci->i_unsafe_lock);
524 req->r_unsafe_dir = dir;
525 list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
526 spin_unlock(&ci->i_unsafe_lock);
527 }
528}
529
530static void __unregister_request(struct ceph_mds_client *mdsc,
531 struct ceph_mds_request *req)
532{
533 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
534 rb_erase(&req->r_node, &mdsc->request_tree);
535 RB_CLEAR_NODE(&req->r_node);
536
537 if (req->r_unsafe_dir) {
538 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
539
540 spin_lock(&ci->i_unsafe_lock);
541 list_del_init(&req->r_unsafe_dir_item);
542 spin_unlock(&ci->i_unsafe_lock);
543 }
544
545 ceph_mdsc_put_request(req);
546}
547
548/*
549 * Choose mds to send request to next. If there is a hint set in the
550 * request (e.g., due to a prior forward hint from the mds), use that.
551 * Otherwise, consult frag tree and/or caps to identify the
552 * appropriate mds. If all else fails, choose randomly.
553 *
554 * Called under mdsc->mutex.
555 */
556static int __choose_mds(struct ceph_mds_client *mdsc,
557 struct ceph_mds_request *req)
558{
559 struct inode *inode;
560 struct ceph_inode_info *ci;
561 struct ceph_cap *cap;
562 int mode = req->r_direct_mode;
563 int mds = -1;
564 u32 hash = req->r_direct_hash;
565 bool is_hash = req->r_direct_is_hash;
566
567 /*
568 * is there a specific mds we should try? ignore hint if we have
569 * no session and the mds is not up (active or recovering).
570 */
571 if (req->r_resend_mds >= 0 &&
572 (__have_session(mdsc, req->r_resend_mds) ||
573 ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
574 dout("choose_mds using resend_mds mds%d\n",
575 req->r_resend_mds);
576 return req->r_resend_mds;
577 }
578
579 if (mode == USE_RANDOM_MDS)
580 goto random;
581
582 inode = NULL;
583 if (req->r_inode) {
584 inode = req->r_inode;
585 } else if (req->r_dentry) {
586 if (req->r_dentry->d_inode) {
587 inode = req->r_dentry->d_inode;
588 } else {
589 inode = req->r_dentry->d_parent->d_inode;
590 hash = req->r_dentry->d_name.hash;
591 is_hash = true;
592 }
593 }
594 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
595 (int)hash, mode);
596 if (!inode)
597 goto random;
598 ci = ceph_inode(inode);
599
600 if (is_hash && S_ISDIR(inode->i_mode)) {
601 struct ceph_inode_frag frag;
602 int found;
603
604 ceph_choose_frag(ci, hash, &frag, &found);
605 if (found) {
606 if (mode == USE_ANY_MDS && frag.ndist > 0) {
607 u8 r;
608
609 /* choose a random replica */
610 get_random_bytes(&r, 1);
611 r %= frag.ndist;
612 mds = frag.dist[r];
613 dout("choose_mds %p %llx.%llx "
614 "frag %u mds%d (%d/%d)\n",
615 inode, ceph_vinop(inode),
616 frag.frag, frag.mds,
617 (int)r, frag.ndist);
618 return mds;
619 }
620
621 /* since this file/dir wasn't known to be
622 * replicated, then we want to look for the
623 * authoritative mds. */
624 mode = USE_AUTH_MDS;
625 if (frag.mds >= 0) {
626 /* choose auth mds */
627 mds = frag.mds;
628 dout("choose_mds %p %llx.%llx "
629 "frag %u mds%d (auth)\n",
630 inode, ceph_vinop(inode), frag.frag, mds);
631 return mds;
632 }
633 }
634 }
635
636 spin_lock(&inode->i_lock);
637 cap = NULL;
638 if (mode == USE_AUTH_MDS)
639 cap = ci->i_auth_cap;
640 if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
641 cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
642 if (!cap) {
643 spin_unlock(&inode->i_lock);
644 goto random;
645 }
646 mds = cap->session->s_mds;
647 dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
648 inode, ceph_vinop(inode), mds,
649 cap == ci->i_auth_cap ? "auth " : "", cap);
650 spin_unlock(&inode->i_lock);
651 return mds;
652
653random:
654 mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
655 dout("choose_mds chose random mds%d\n", mds);
656 return mds;
657}
658
659
660/*
661 * session messages
662 */
663static struct ceph_msg *create_session_msg(u32 op, u64 seq)
664{
665 struct ceph_msg *msg;
666 struct ceph_mds_session_head *h;
667
668 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL);
669 if (IS_ERR(msg)) {
670 pr_err("create_session_msg ENOMEM creating msg\n");
671 return ERR_PTR(PTR_ERR(msg));
672 }
673 h = msg->front.iov_base;
674 h->op = cpu_to_le32(op);
675 h->seq = cpu_to_le64(seq);
676 return msg;
677}
678
679/*
680 * send session open request.
681 *
682 * called under mdsc->mutex
683 */
684static int __open_session(struct ceph_mds_client *mdsc,
685 struct ceph_mds_session *session)
686{
687 struct ceph_msg *msg;
688 int mstate;
689 int mds = session->s_mds;
690 int err = 0;
691
692 /* wait for mds to go active? */
693 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
694 dout("open_session to mds%d (%s)\n", mds,
695 ceph_mds_state_name(mstate));
696 session->s_state = CEPH_MDS_SESSION_OPENING;
697 session->s_renew_requested = jiffies;
698
699 /* send connect message */
700 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
701 if (IS_ERR(msg)) {
702 err = PTR_ERR(msg);
703 goto out;
704 }
705 ceph_con_send(&session->s_con, msg);
706
707out:
708 return 0;
709}
710
711/*
712 * session caps
713 */
714
715/*
716 * Free preallocated cap messages assigned to this session
717 */
718static void cleanup_cap_releases(struct ceph_mds_session *session)
719{
720 struct ceph_msg *msg;
721
722 spin_lock(&session->s_cap_lock);
723 while (!list_empty(&session->s_cap_releases)) {
724 msg = list_first_entry(&session->s_cap_releases,
725 struct ceph_msg, list_head);
726 list_del_init(&msg->list_head);
727 ceph_msg_put(msg);
728 }
729 while (!list_empty(&session->s_cap_releases_done)) {
730 msg = list_first_entry(&session->s_cap_releases_done,
731 struct ceph_msg, list_head);
732 list_del_init(&msg->list_head);
733 ceph_msg_put(msg);
734 }
735 spin_unlock(&session->s_cap_lock);
736}
737
738/*
739 * Helper to safely iterate over all caps associated with a session.
740 *
741 * caller must hold session s_mutex
742 */
743static int iterate_session_caps(struct ceph_mds_session *session,
744 int (*cb)(struct inode *, struct ceph_cap *,
745 void *), void *arg)
746{
747 struct list_head *p;
748 struct ceph_cap *cap;
749 struct inode *inode, *last_inode = NULL;
750 struct ceph_cap *old_cap = NULL;
751 int ret;
752
753 dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
754 spin_lock(&session->s_cap_lock);
755 p = session->s_caps.next;
756 while (p != &session->s_caps) {
757 cap = list_entry(p, struct ceph_cap, session_caps);
758 inode = igrab(&cap->ci->vfs_inode);
759 if (!inode) {
760 p = p->next;
761 continue;
762 }
763 session->s_cap_iterator = cap;
764 spin_unlock(&session->s_cap_lock);
765
766 if (last_inode) {
767 iput(last_inode);
768 last_inode = NULL;
769 }
770 if (old_cap) {
771 ceph_put_cap(old_cap);
772 old_cap = NULL;
773 }
774
775 ret = cb(inode, cap, arg);
776 last_inode = inode;
777
778 spin_lock(&session->s_cap_lock);
779 p = p->next;
780 if (cap->ci == NULL) {
781 dout("iterate_session_caps finishing cap %p removal\n",
782 cap);
783 BUG_ON(cap->session != session);
784 list_del_init(&cap->session_caps);
785 session->s_nr_caps--;
786 cap->session = NULL;
787 old_cap = cap; /* put_cap it w/o locks held */
788 }
789 if (ret < 0)
790 goto out;
791 }
792 ret = 0;
793out:
794 session->s_cap_iterator = NULL;
795 spin_unlock(&session->s_cap_lock);
796
797 if (last_inode)
798 iput(last_inode);
799 if (old_cap)
800 ceph_put_cap(old_cap);
801
802 return ret;
803}
804
805static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
806 void *arg)
807{
808 struct ceph_inode_info *ci = ceph_inode(inode);
809 dout("removing cap %p, ci is %p, inode is %p\n",
810 cap, ci, &ci->vfs_inode);
811 ceph_remove_cap(cap);
812 return 0;
813}
814
815/*
816 * caller must hold session s_mutex
817 */
818static void remove_session_caps(struct ceph_mds_session *session)
819{
820 dout("remove_session_caps on %p\n", session);
821 iterate_session_caps(session, remove_session_caps_cb, NULL);
822 BUG_ON(session->s_nr_caps > 0);
823 cleanup_cap_releases(session);
824}
825
826/*
827 * wake up any threads waiting on this session's caps. if the cap is
828 * old (didn't get renewed on the client reconnect), remove it now.
829 *
830 * caller must hold s_mutex.
831 */
832static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
833 void *arg)
834{
835 struct ceph_inode_info *ci = ceph_inode(inode);
836
837 wake_up(&ci->i_cap_wq);
838 if (arg) {
839 spin_lock(&inode->i_lock);
840 ci->i_wanted_max_size = 0;
841 ci->i_requested_max_size = 0;
842 spin_unlock(&inode->i_lock);
843 }
844 return 0;
845}
846
847static void wake_up_session_caps(struct ceph_mds_session *session,
848 int reconnect)
849{
850 dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
851 iterate_session_caps(session, wake_up_session_cb,
852 (void *)(unsigned long)reconnect);
853}
854
855/*
856 * Send periodic message to MDS renewing all currently held caps. The
857 * ack will reset the expiration for all caps from this session.
858 *
859 * caller holds s_mutex
860 */
861static int send_renew_caps(struct ceph_mds_client *mdsc,
862 struct ceph_mds_session *session)
863{
864 struct ceph_msg *msg;
865 int state;
866
867 if (time_after_eq(jiffies, session->s_cap_ttl) &&
868 time_after_eq(session->s_cap_ttl, session->s_renew_requested))
869 pr_info("mds%d caps stale\n", session->s_mds);
870 session->s_renew_requested = jiffies;
871
872 /* do not try to renew caps until a recovering mds has reconnected
873 * with its clients. */
874 state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
875 if (state < CEPH_MDS_STATE_RECONNECT) {
876 dout("send_renew_caps ignoring mds%d (%s)\n",
877 session->s_mds, ceph_mds_state_name(state));
878 return 0;
879 }
880
881 dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
882 ceph_mds_state_name(state));
883 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
884 ++session->s_renew_seq);
885 if (IS_ERR(msg))
886 return PTR_ERR(msg);
887 ceph_con_send(&session->s_con, msg);
888 return 0;
889}
890
891/*
892 * Note new cap ttl, and any transition from stale -> not stale (fresh?).
893 *
894 * Called under session->s_mutex
895 */
896static void renewed_caps(struct ceph_mds_client *mdsc,
897 struct ceph_mds_session *session, int is_renew)
898{
899 int was_stale;
900 int wake = 0;
901
902 spin_lock(&session->s_cap_lock);
903 was_stale = is_renew && (session->s_cap_ttl == 0 ||
904 time_after_eq(jiffies, session->s_cap_ttl));
905
906 session->s_cap_ttl = session->s_renew_requested +
907 mdsc->mdsmap->m_session_timeout*HZ;
908
909 if (was_stale) {
910 if (time_before(jiffies, session->s_cap_ttl)) {
911 pr_info("mds%d caps renewed\n", session->s_mds);
912 wake = 1;
913 } else {
914 pr_info("mds%d caps still stale\n", session->s_mds);
915 }
916 }
917 dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
918 session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
919 time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
920 spin_unlock(&session->s_cap_lock);
921
922 if (wake)
923 wake_up_session_caps(session, 0);
924}
925
926/*
927 * send a session close request
928 */
929static int request_close_session(struct ceph_mds_client *mdsc,
930 struct ceph_mds_session *session)
931{
932 struct ceph_msg *msg;
933 int err = 0;
934
935 dout("request_close_session mds%d state %s seq %lld\n",
936 session->s_mds, session_state_name(session->s_state),
937 session->s_seq);
938 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
939 if (IS_ERR(msg))
940 err = PTR_ERR(msg);
941 else
942 ceph_con_send(&session->s_con, msg);
943 return err;
944}
945
946/*
947 * Called with s_mutex held.
948 */
949static int __close_session(struct ceph_mds_client *mdsc,
950 struct ceph_mds_session *session)
951{
952 if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
953 return 0;
954 session->s_state = CEPH_MDS_SESSION_CLOSING;
955 return request_close_session(mdsc, session);
956}
957
958/*
959 * Trim old(er) caps.
960 *
961 * Because we can't cache an inode without one or more caps, we do
962 * this indirectly: if a cap is unused, we prune its aliases, at which
963 * point the inode will hopefully get dropped to.
964 *
965 * Yes, this is a bit sloppy. Our only real goal here is to respond to
966 * memory pressure from the MDS, though, so it needn't be perfect.
967 */
968static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
969{
970 struct ceph_mds_session *session = arg;
971 struct ceph_inode_info *ci = ceph_inode(inode);
972 int used, oissued, mine;
973
974 if (session->s_trim_caps <= 0)
975 return -1;
976
977 spin_lock(&inode->i_lock);
978 mine = cap->issued | cap->implemented;
979 used = __ceph_caps_used(ci);
980 oissued = __ceph_caps_issued_other(ci, cap);
981
982 dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
983 inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
984 ceph_cap_string(used));
985 if (ci->i_dirty_caps)
986 goto out; /* dirty caps */
987 if ((used & ~oissued) & mine)
988 goto out; /* we need these caps */
989
990 session->s_trim_caps--;
991 if (oissued) {
992 /* we aren't the only cap.. just remove us */
993 __ceph_remove_cap(cap);
994 } else {
995 /* try to drop referring dentries */
996 spin_unlock(&inode->i_lock);
997 d_prune_aliases(inode);
998 dout("trim_caps_cb %p cap %p pruned, count now %d\n",
999 inode, cap, atomic_read(&inode->i_count));
1000 return 0;
1001 }
1002
1003out:
1004 spin_unlock(&inode->i_lock);
1005 return 0;
1006}
1007
1008/*
1009 * Trim session cap count down to some max number.
1010 */
1011static int trim_caps(struct ceph_mds_client *mdsc,
1012 struct ceph_mds_session *session,
1013 int max_caps)
1014{
1015 int trim_caps = session->s_nr_caps - max_caps;
1016
1017 dout("trim_caps mds%d start: %d / %d, trim %d\n",
1018 session->s_mds, session->s_nr_caps, max_caps, trim_caps);
1019 if (trim_caps > 0) {
1020 session->s_trim_caps = trim_caps;
1021 iterate_session_caps(session, trim_caps_cb, session);
1022 dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
1023 session->s_mds, session->s_nr_caps, max_caps,
1024 trim_caps - session->s_trim_caps);
1025 session->s_trim_caps = 0;
1026 }
1027 return 0;
1028}
1029
1030/*
1031 * Allocate cap_release messages. If there is a partially full message
1032 * in the queue, try to allocate enough to cover it's remainder, so that
1033 * we can send it immediately.
1034 *
1035 * Called under s_mutex.
1036 */
1037static int add_cap_releases(struct ceph_mds_client *mdsc,
1038 struct ceph_mds_session *session,
1039 int extra)
1040{
1041 struct ceph_msg *msg;
1042 struct ceph_mds_cap_release *head;
1043 int err = -ENOMEM;
1044
1045 if (extra < 0)
1046 extra = mdsc->client->mount_args->cap_release_safety;
1047
1048 spin_lock(&session->s_cap_lock);
1049
1050 if (!list_empty(&session->s_cap_releases)) {
1051 msg = list_first_entry(&session->s_cap_releases,
1052 struct ceph_msg,
1053 list_head);
1054 head = msg->front.iov_base;
1055 extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1056 }
1057
1058 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1059 spin_unlock(&session->s_cap_lock);
1060 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
1061 0, 0, NULL);
1062 if (!msg)
1063 goto out_unlocked;
1064 dout("add_cap_releases %p msg %p now %d\n", session, msg,
1065 (int)msg->front.iov_len);
1066 head = msg->front.iov_base;
1067 head->num = cpu_to_le32(0);
1068 msg->front.iov_len = sizeof(*head);
1069 spin_lock(&session->s_cap_lock);
1070 list_add(&msg->list_head, &session->s_cap_releases);
1071 session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
1072 }
1073
1074 if (!list_empty(&session->s_cap_releases)) {
1075 msg = list_first_entry(&session->s_cap_releases,
1076 struct ceph_msg,
1077 list_head);
1078 head = msg->front.iov_base;
1079 if (head->num) {
1080 dout(" queueing non-full %p (%d)\n", msg,
1081 le32_to_cpu(head->num));
1082 list_move_tail(&msg->list_head,
1083 &session->s_cap_releases_done);
1084 session->s_num_cap_releases -=
1085 CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1086 }
1087 }
1088 err = 0;
1089 spin_unlock(&session->s_cap_lock);
1090out_unlocked:
1091 return err;
1092}
1093
1094/*
1095 * flush all dirty inode data to disk.
1096 *
1097 * returns true if we've flushed through want_flush_seq
1098 */
1099static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
1100{
1101 int mds, ret = 1;
1102
1103 dout("check_cap_flush want %lld\n", want_flush_seq);
1104 mutex_lock(&mdsc->mutex);
1105 for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
1106 struct ceph_mds_session *session = mdsc->sessions[mds];
1107
1108 if (!session)
1109 continue;
1110 get_session(session);
1111 mutex_unlock(&mdsc->mutex);
1112
1113 mutex_lock(&session->s_mutex);
1114 if (!list_empty(&session->s_cap_flushing)) {
1115 struct ceph_inode_info *ci =
1116 list_entry(session->s_cap_flushing.next,
1117 struct ceph_inode_info,
1118 i_flushing_item);
1119 struct inode *inode = &ci->vfs_inode;
1120
1121 spin_lock(&inode->i_lock);
1122 if (ci->i_cap_flush_seq <= want_flush_seq) {
1123 dout("check_cap_flush still flushing %p "
1124 "seq %lld <= %lld to mds%d\n", inode,
1125 ci->i_cap_flush_seq, want_flush_seq,
1126 session->s_mds);
1127 ret = 0;
1128 }
1129 spin_unlock(&inode->i_lock);
1130 }
1131 mutex_unlock(&session->s_mutex);
1132 ceph_put_mds_session(session);
1133
1134 if (!ret)
1135 return ret;
1136 mutex_lock(&mdsc->mutex);
1137 }
1138
1139 mutex_unlock(&mdsc->mutex);
1140 dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
1141 return ret;
1142}
1143
1144/*
1145 * called under s_mutex
1146 */
1147static void send_cap_releases(struct ceph_mds_client *mdsc,
1148 struct ceph_mds_session *session)
1149{
1150 struct ceph_msg *msg;
1151
1152 dout("send_cap_releases mds%d\n", session->s_mds);
1153 while (1) {
1154 spin_lock(&session->s_cap_lock);
1155 if (list_empty(&session->s_cap_releases_done))
1156 break;
1157 msg = list_first_entry(&session->s_cap_releases_done,
1158 struct ceph_msg, list_head);
1159 list_del_init(&msg->list_head);
1160 spin_unlock(&session->s_cap_lock);
1161 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1162 dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
1163 ceph_con_send(&session->s_con, msg);
1164 }
1165 spin_unlock(&session->s_cap_lock);
1166}
1167
1168/*
1169 * requests
1170 */
1171
1172/*
1173 * Create an mds request.
1174 */
1175struct ceph_mds_request *
1176ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1177{
1178 struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
1179
1180 if (!req)
1181 return ERR_PTR(-ENOMEM);
1182
1183 req->r_started = jiffies;
1184 req->r_resend_mds = -1;
1185 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
1186 req->r_fmode = -1;
1187 kref_init(&req->r_kref);
1188 INIT_LIST_HEAD(&req->r_wait);
1189 init_completion(&req->r_completion);
1190 init_completion(&req->r_safe_completion);
1191 INIT_LIST_HEAD(&req->r_unsafe_item);
1192
1193 req->r_op = op;
1194 req->r_direct_mode = mode;
1195 return req;
1196}
1197
1198/*
1199 * return oldest (lowest) request, tid in request tree, 0 if none.
1200 *
1201 * called under mdsc->mutex.
1202 */
1203static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
1204{
1205 if (RB_EMPTY_ROOT(&mdsc->request_tree))
1206 return NULL;
1207 return rb_entry(rb_first(&mdsc->request_tree),
1208 struct ceph_mds_request, r_node);
1209}
1210
1211static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
1212{
1213 struct ceph_mds_request *req = __get_oldest_req(mdsc);
1214
1215 if (req)
1216 return req->r_tid;
1217 return 0;
1218}
1219
1220/*
1221 * Build a dentry's path. Allocate on heap; caller must kfree. Based
1222 * on build_path_from_dentry in fs/cifs/dir.c.
1223 *
1224 * If @stop_on_nosnap, generate path relative to the first non-snapped
1225 * inode.
1226 *
1227 * Encode hidden .snap dirs as a double /, i.e.
1228 * foo/.snap/bar -> foo//bar
1229 */
1230char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
1231 int stop_on_nosnap)
1232{
1233 struct dentry *temp;
1234 char *path;
1235 int len, pos;
1236
1237 if (dentry == NULL)
1238 return ERR_PTR(-EINVAL);
1239
1240retry:
1241 len = 0;
1242 for (temp = dentry; !IS_ROOT(temp);) {
1243 struct inode *inode = temp->d_inode;
1244 if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
1245 len++; /* slash only */
1246 else if (stop_on_nosnap && inode &&
1247 ceph_snap(inode) == CEPH_NOSNAP)
1248 break;
1249 else
1250 len += 1 + temp->d_name.len;
1251 temp = temp->d_parent;
1252 if (temp == NULL) {
1253 pr_err("build_path_dentry corrupt dentry %p\n", dentry);
1254 return ERR_PTR(-EINVAL);
1255 }
1256 }
1257 if (len)
1258 len--; /* no leading '/' */
1259
1260 path = kmalloc(len+1, GFP_NOFS);
1261 if (path == NULL)
1262 return ERR_PTR(-ENOMEM);
1263 pos = len;
1264 path[pos] = 0; /* trailing null */
1265 for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
1266 struct inode *inode = temp->d_inode;
1267
1268 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
1269 dout("build_path_dentry path+%d: %p SNAPDIR\n",
1270 pos, temp);
1271 } else if (stop_on_nosnap && inode &&
1272 ceph_snap(inode) == CEPH_NOSNAP) {
1273 break;
1274 } else {
1275 pos -= temp->d_name.len;
1276 if (pos < 0)
1277 break;
1278 strncpy(path + pos, temp->d_name.name,
1279 temp->d_name.len);
1280 dout("build_path_dentry path+%d: %p '%.*s'\n",
1281 pos, temp, temp->d_name.len, path + pos);
1282 }
1283 if (pos)
1284 path[--pos] = '/';
1285 temp = temp->d_parent;
1286 if (temp == NULL) {
1287 pr_err("build_path_dentry corrupt dentry\n");
1288 kfree(path);
1289 return ERR_PTR(-EINVAL);
1290 }
1291 }
1292 if (pos != 0) {
1293 pr_err("build_path_dentry did not end path lookup where "
1294 "expected, namelen is %d, pos is %d\n", len, pos);
1295 /* presumably this is only possible if racing with a
1296 rename of one of the parent directories (we can not
1297 lock the dentries above us to prevent this, but
1298 retrying should be harmless) */
1299 kfree(path);
1300 goto retry;
1301 }
1302
1303 *base = ceph_ino(temp->d_inode);
1304 *plen = len;
1305 dout("build_path_dentry on %p %d built %llx '%.*s'\n",
1306 dentry, atomic_read(&dentry->d_count), *base, len, path);
1307 return path;
1308}
1309
1310static int build_dentry_path(struct dentry *dentry,
1311 const char **ppath, int *ppathlen, u64 *pino,
1312 int *pfreepath)
1313{
1314 char *path;
1315
1316 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) {
1317 *pino = ceph_ino(dentry->d_parent->d_inode);
1318 *ppath = dentry->d_name.name;
1319 *ppathlen = dentry->d_name.len;
1320 return 0;
1321 }
1322 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1323 if (IS_ERR(path))
1324 return PTR_ERR(path);
1325 *ppath = path;
1326 *pfreepath = 1;
1327 return 0;
1328}
1329
1330static int build_inode_path(struct inode *inode,
1331 const char **ppath, int *ppathlen, u64 *pino,
1332 int *pfreepath)
1333{
1334 struct dentry *dentry;
1335 char *path;
1336
1337 if (ceph_snap(inode) == CEPH_NOSNAP) {
1338 *pino = ceph_ino(inode);
1339 *ppathlen = 0;
1340 return 0;
1341 }
1342 dentry = d_find_alias(inode);
1343 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1344 dput(dentry);
1345 if (IS_ERR(path))
1346 return PTR_ERR(path);
1347 *ppath = path;
1348 *pfreepath = 1;
1349 return 0;
1350}
1351
1352/*
1353 * request arguments may be specified via an inode *, a dentry *, or
1354 * an explicit ino+path.
1355 */
1356static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
1357 const char *rpath, u64 rino,
1358 const char **ppath, int *pathlen,
1359 u64 *ino, int *freepath)
1360{
1361 int r = 0;
1362
1363 if (rinode) {
1364 r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
1365 dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
1366 ceph_snap(rinode));
1367 } else if (rdentry) {
1368 r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
1369 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
1370 *ppath);
1371 } else if (rpath) {
1372 *ino = rino;
1373 *ppath = rpath;
1374 *pathlen = strlen(rpath);
1375 dout(" path %.*s\n", *pathlen, rpath);
1376 }
1377
1378 return r;
1379}
1380
1381/*
1382 * called under mdsc->mutex
1383 */
1384static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1385 struct ceph_mds_request *req,
1386 int mds)
1387{
1388 struct ceph_msg *msg;
1389 struct ceph_mds_request_head *head;
1390 const char *path1 = NULL;
1391 const char *path2 = NULL;
1392 u64 ino1 = 0, ino2 = 0;
1393 int pathlen1 = 0, pathlen2 = 0;
1394 int freepath1 = 0, freepath2 = 0;
1395 int len;
1396 u16 releases;
1397 void *p, *end;
1398 int ret;
1399
1400 ret = set_request_path_attr(req->r_inode, req->r_dentry,
1401 req->r_path1, req->r_ino1.ino,
1402 &path1, &pathlen1, &ino1, &freepath1);
1403 if (ret < 0) {
1404 msg = ERR_PTR(ret);
1405 goto out;
1406 }
1407
1408 ret = set_request_path_attr(NULL, req->r_old_dentry,
1409 req->r_path2, req->r_ino2.ino,
1410 &path2, &pathlen2, &ino2, &freepath2);
1411 if (ret < 0) {
1412 msg = ERR_PTR(ret);
1413 goto out_free1;
1414 }
1415
1416 len = sizeof(*head) +
1417 pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));
1418
1419 /* calculate (max) length for cap releases */
1420 len += sizeof(struct ceph_mds_request_release) *
1421 (!!req->r_inode_drop + !!req->r_dentry_drop +
1422 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
1423 if (req->r_dentry_drop)
1424 len += req->r_dentry->d_name.len;
1425 if (req->r_old_dentry_drop)
1426 len += req->r_old_dentry->d_name.len;
1427
1428 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL);
1429 if (IS_ERR(msg))
1430 goto out_free2;
1431
1432 msg->hdr.tid = cpu_to_le64(req->r_tid);
1433
1434 head = msg->front.iov_base;
1435 p = msg->front.iov_base + sizeof(*head);
1436 end = msg->front.iov_base + msg->front.iov_len;
1437
1438 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
1439 head->op = cpu_to_le32(req->r_op);
1440 head->caller_uid = cpu_to_le32(current_fsuid());
1441 head->caller_gid = cpu_to_le32(current_fsgid());
1442 head->args = req->r_args;
1443
1444 ceph_encode_filepath(&p, end, ino1, path1);
1445 ceph_encode_filepath(&p, end, ino2, path2);
1446
1447 /* cap releases */
1448 releases = 0;
1449 if (req->r_inode_drop)
1450 releases += ceph_encode_inode_release(&p,
1451 req->r_inode ? req->r_inode : req->r_dentry->d_inode,
1452 mds, req->r_inode_drop, req->r_inode_unless, 0);
1453 if (req->r_dentry_drop)
1454 releases += ceph_encode_dentry_release(&p, req->r_dentry,
1455 mds, req->r_dentry_drop, req->r_dentry_unless);
1456 if (req->r_old_dentry_drop)
1457 releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
1458 mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
1459 if (req->r_old_inode_drop)
1460 releases += ceph_encode_inode_release(&p,
1461 req->r_old_dentry->d_inode,
1462 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
1463 head->num_releases = cpu_to_le16(releases);
1464
1465 BUG_ON(p > end);
1466 msg->front.iov_len = p - msg->front.iov_base;
1467 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1468
1469 msg->pages = req->r_pages;
1470 msg->nr_pages = req->r_num_pages;
1471 msg->hdr.data_len = cpu_to_le32(req->r_data_len);
1472 msg->hdr.data_off = cpu_to_le16(0);
1473
1474out_free2:
1475 if (freepath2)
1476 kfree((char *)path2);
1477out_free1:
1478 if (freepath1)
1479 kfree((char *)path1);
1480out:
1481 return msg;
1482}
1483
1484/*
1485 * called under mdsc->mutex if error, under no mutex if
1486 * success.
1487 */
1488static void complete_request(struct ceph_mds_client *mdsc,
1489 struct ceph_mds_request *req)
1490{
1491 if (req->r_callback)
1492 req->r_callback(mdsc, req);
1493 else
1494 complete(&req->r_completion);
1495}
1496
1497/*
1498 * called under mdsc->mutex
1499 */
1500static int __prepare_send_request(struct ceph_mds_client *mdsc,
1501 struct ceph_mds_request *req,
1502 int mds)
1503{
1504 struct ceph_mds_request_head *rhead;
1505 struct ceph_msg *msg;
1506 int flags = 0;
1507
1508 req->r_mds = mds;
1509 req->r_attempts++;
1510 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
1511 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
1512
1513 if (req->r_request) {
1514 ceph_msg_put(req->r_request);
1515 req->r_request = NULL;
1516 }
1517 msg = create_request_message(mdsc, req, mds);
1518 if (IS_ERR(msg)) {
1519 req->r_reply = ERR_PTR(PTR_ERR(msg));
1520 complete_request(mdsc, req);
1521 return -PTR_ERR(msg);
1522 }
1523 req->r_request = msg;
1524
1525 rhead = msg->front.iov_base;
1526 rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
1527 if (req->r_got_unsafe)
1528 flags |= CEPH_MDS_FLAG_REPLAY;
1529 if (req->r_locked_dir)
1530 flags |= CEPH_MDS_FLAG_WANT_DENTRY;
1531 rhead->flags = cpu_to_le32(flags);
1532 rhead->num_fwd = req->r_num_fwd;
1533 rhead->num_retry = req->r_attempts - 1;
1534
1535 dout(" r_locked_dir = %p\n", req->r_locked_dir);
1536
1537 if (req->r_target_inode && req->r_got_unsafe)
1538 rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
1539 else
1540 rhead->ino = 0;
1541 return 0;
1542}
1543
1544/*
1545 * send request, or put it on the appropriate wait list.
1546 */
1547static int __do_request(struct ceph_mds_client *mdsc,
1548 struct ceph_mds_request *req)
1549{
1550 struct ceph_mds_session *session = NULL;
1551 int mds = -1;
1552 int err = -EAGAIN;
1553
1554 if (req->r_reply)
1555 goto out;
1556
1557 if (req->r_timeout &&
1558 time_after_eq(jiffies, req->r_started + req->r_timeout)) {
1559 dout("do_request timed out\n");
1560 err = -EIO;
1561 goto finish;
1562 }
1563
1564 mds = __choose_mds(mdsc, req);
1565 if (mds < 0 ||
1566 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
1567 dout("do_request no mds or not active, waiting for map\n");
1568 list_add(&req->r_wait, &mdsc->waiting_for_map);
1569 goto out;
1570 }
1571
1572 /* get, open session */
1573 session = __ceph_lookup_mds_session(mdsc, mds);
1574 if (!session) {
1575 session = register_session(mdsc, mds);
1576 if (IS_ERR(session)) {
1577 err = PTR_ERR(session);
1578 goto finish;
1579 }
1580 }
1581 dout("do_request mds%d session %p state %s\n", mds, session,
1582 session_state_name(session->s_state));
1583 if (session->s_state != CEPH_MDS_SESSION_OPEN &&
1584 session->s_state != CEPH_MDS_SESSION_HUNG) {
1585 if (session->s_state == CEPH_MDS_SESSION_NEW ||
1586 session->s_state == CEPH_MDS_SESSION_CLOSING)
1587 __open_session(mdsc, session);
1588 list_add(&req->r_wait, &session->s_waiting);
1589 goto out_session;
1590 }
1591
1592 /* send request */
1593 req->r_session = get_session(session);
1594 req->r_resend_mds = -1; /* forget any previous mds hint */
1595
1596 if (req->r_request_started == 0) /* note request start time */
1597 req->r_request_started = jiffies;
1598
1599 err = __prepare_send_request(mdsc, req, mds);
1600 if (!err) {
1601 ceph_msg_get(req->r_request);
1602 ceph_con_send(&session->s_con, req->r_request);
1603 }
1604
1605out_session:
1606 ceph_put_mds_session(session);
1607out:
1608 return err;
1609
1610finish:
1611 req->r_reply = ERR_PTR(err);
1612 complete_request(mdsc, req);
1613 goto out;
1614}
1615
1616/*
1617 * called under mdsc->mutex
1618 */
1619static void __wake_requests(struct ceph_mds_client *mdsc,
1620 struct list_head *head)
1621{
1622 struct ceph_mds_request *req, *nreq;
1623
1624 list_for_each_entry_safe(req, nreq, head, r_wait) {
1625 list_del_init(&req->r_wait);
1626 __do_request(mdsc, req);
1627 }
1628}
1629
1630/*
1631 * Wake up threads with requests pending for @mds, so that they can
1632 * resubmit their requests to a possibly different mds. If @all is set,
1633 * wake up if their requests has been forwarded to @mds, too.
1634 */
1635static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all)
1636{
1637 struct ceph_mds_request *req;
1638 struct rb_node *p;
1639
1640 dout("kick_requests mds%d\n", mds);
1641 for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) {
1642 req = rb_entry(p, struct ceph_mds_request, r_node);
1643 if (req->r_got_unsafe)
1644 continue;
1645 if (req->r_session &&
1646 req->r_session->s_mds == mds) {
1647 dout(" kicking tid %llu\n", req->r_tid);
1648 put_request_session(req);
1649 __do_request(mdsc, req);
1650 }
1651 }
1652}
1653
1654void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
1655 struct ceph_mds_request *req)
1656{
1657 dout("submit_request on %p\n", req);
1658 mutex_lock(&mdsc->mutex);
1659 __register_request(mdsc, req, NULL);
1660 __do_request(mdsc, req);
1661 mutex_unlock(&mdsc->mutex);
1662}
1663
1664/*
1665 * Synchrously perform an mds request. Take care of all of the
1666 * session setup, forwarding, retry details.
1667 */
1668int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1669 struct inode *dir,
1670 struct ceph_mds_request *req)
1671{
1672 int err;
1673
1674 dout("do_request on %p\n", req);
1675
1676 /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
1677 if (req->r_inode)
1678 ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
1679 if (req->r_locked_dir)
1680 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
1681 if (req->r_old_dentry)
1682 ceph_get_cap_refs(
1683 ceph_inode(req->r_old_dentry->d_parent->d_inode),
1684 CEPH_CAP_PIN);
1685
1686 /* issue */
1687 mutex_lock(&mdsc->mutex);
1688 __register_request(mdsc, req, dir);
1689 __do_request(mdsc, req);
1690
1691 /* wait */
1692 if (!req->r_reply) {
1693 mutex_unlock(&mdsc->mutex);
1694 if (req->r_timeout) {
1695 err = (long)wait_for_completion_interruptible_timeout(
1696 &req->r_completion, req->r_timeout);
1697 if (err == 0)
1698 req->r_reply = ERR_PTR(-EIO);
1699 else if (err < 0)
1700 req->r_reply = ERR_PTR(err);
1701 } else {
1702 err = wait_for_completion_interruptible(
1703 &req->r_completion);
1704 if (err)
1705 req->r_reply = ERR_PTR(err);
1706 }
1707 mutex_lock(&mdsc->mutex);
1708 }
1709
1710 if (IS_ERR(req->r_reply)) {
1711 err = PTR_ERR(req->r_reply);
1712 req->r_reply = NULL;
1713
1714 if (err == -ERESTARTSYS) {
1715 /* aborted */
1716 req->r_aborted = true;
1717
1718 if (req->r_locked_dir &&
1719 (req->r_op & CEPH_MDS_OP_WRITE)) {
1720 struct ceph_inode_info *ci =
1721 ceph_inode(req->r_locked_dir);
1722
1723 dout("aborted, clearing I_COMPLETE on %p\n",
1724 req->r_locked_dir);
1725 spin_lock(&req->r_locked_dir->i_lock);
1726 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1727 ci->i_release_count++;
1728 spin_unlock(&req->r_locked_dir->i_lock);
1729 }
1730 } else {
1731 /* clean up this request */
1732 __unregister_request(mdsc, req);
1733 if (!list_empty(&req->r_unsafe_item))
1734 list_del_init(&req->r_unsafe_item);
1735 complete(&req->r_safe_completion);
1736 }
1737 } else if (req->r_err) {
1738 err = req->r_err;
1739 } else {
1740 err = le32_to_cpu(req->r_reply_info.head->result);
1741 }
1742 mutex_unlock(&mdsc->mutex);
1743
1744 dout("do_request %p done, result %d\n", req, err);
1745 return err;
1746}
1747
1748/*
1749 * Handle mds reply.
1750 *
1751 * We take the session mutex and parse and process the reply immediately.
1752 * This preserves the logical ordering of replies, capabilities, etc., sent
1753 * by the MDS as they are applied to our local cache.
1754 */
1755static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1756{
1757 struct ceph_mds_client *mdsc = session->s_mdsc;
1758 struct ceph_mds_request *req;
1759 struct ceph_mds_reply_head *head = msg->front.iov_base;
1760 struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
1761 u64 tid;
1762 int err, result;
1763 int mds = session->s_mds;
1764
1765 if (msg->front.iov_len < sizeof(*head)) {
1766 pr_err("mdsc_handle_reply got corrupt (short) reply\n");
1767 ceph_msg_dump(msg);
1768 return;
1769 }
1770
1771 /* get request, session */
1772 tid = le64_to_cpu(msg->hdr.tid);
1773 mutex_lock(&mdsc->mutex);
1774 req = __lookup_request(mdsc, tid);
1775 if (!req) {
1776 dout("handle_reply on unknown tid %llu\n", tid);
1777 mutex_unlock(&mdsc->mutex);
1778 return;
1779 }
1780 dout("handle_reply %p\n", req);
1781
1782 /* correct session? */
1783 if (req->r_session != session) {
1784 pr_err("mdsc_handle_reply got %llu on session mds%d"
1785 " not mds%d\n", tid, session->s_mds,
1786 req->r_session ? req->r_session->s_mds : -1);
1787 mutex_unlock(&mdsc->mutex);
1788 goto out;
1789 }
1790
1791 /* dup? */
1792 if ((req->r_got_unsafe && !head->safe) ||
1793 (req->r_got_safe && head->safe)) {
1794 pr_warning("got a dup %s reply on %llu from mds%d\n",
1795 head->safe ? "safe" : "unsafe", tid, mds);
1796 mutex_unlock(&mdsc->mutex);
1797 goto out;
1798 }
1799
1800 result = le32_to_cpu(head->result);
1801
1802 /*
1803 * Tolerate 2 consecutive ESTALEs from the same mds.
1804 * FIXME: we should be looking at the cap migrate_seq.
1805 */
1806 if (result == -ESTALE) {
1807 req->r_direct_mode = USE_AUTH_MDS;
1808 req->r_num_stale++;
1809 if (req->r_num_stale <= 2) {
1810 __do_request(mdsc, req);
1811 mutex_unlock(&mdsc->mutex);
1812 goto out;
1813 }
1814 } else {
1815 req->r_num_stale = 0;
1816 }
1817
1818 if (head->safe) {
1819 req->r_got_safe = true;
1820 __unregister_request(mdsc, req);
1821 complete(&req->r_safe_completion);
1822
1823 if (req->r_got_unsafe) {
1824 /*
1825 * We already handled the unsafe response, now do the
1826 * cleanup. No need to examine the response; the MDS
1827 * doesn't include any result info in the safe
1828 * response. And even if it did, there is nothing
1829 * useful we could do with a revised return value.
1830 */
1831 dout("got safe reply %llu, mds%d\n", tid, mds);
1832 list_del_init(&req->r_unsafe_item);
1833
1834 /* last unsafe request during umount? */
1835 if (mdsc->stopping && !__get_oldest_req(mdsc))
1836 complete(&mdsc->safe_umount_waiters);
1837 mutex_unlock(&mdsc->mutex);
1838 goto out;
1839 }
1840 }
1841
1842 BUG_ON(req->r_reply);
1843
1844 if (!head->safe) {
1845 req->r_got_unsafe = true;
1846 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
1847 }
1848
1849 dout("handle_reply tid %lld result %d\n", tid, result);
1850 rinfo = &req->r_reply_info;
1851 err = parse_reply_info(msg, rinfo);
1852 mutex_unlock(&mdsc->mutex);
1853
1854 mutex_lock(&session->s_mutex);
1855 if (err < 0) {
1856 pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
1857 ceph_msg_dump(msg);
1858 goto out_err;
1859 }
1860
1861 /* snap trace */
1862 if (rinfo->snapblob_len) {
1863 down_write(&mdsc->snap_rwsem);
1864 ceph_update_snap_trace(mdsc, rinfo->snapblob,
1865 rinfo->snapblob + rinfo->snapblob_len,
1866 le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
1867 downgrade_write(&mdsc->snap_rwsem);
1868 } else {
1869 down_read(&mdsc->snap_rwsem);
1870 }
1871
1872 /* insert trace into our cache */
1873 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
1874 if (err == 0) {
1875 if (result == 0 && rinfo->dir_nr)
1876 ceph_readdir_prepopulate(req, req->r_session);
1877 ceph_unreserve_caps(&req->r_caps_reservation);
1878 }
1879
1880 up_read(&mdsc->snap_rwsem);
1881out_err:
1882 if (err) {
1883 req->r_err = err;
1884 } else {
1885 req->r_reply = msg;
1886 ceph_msg_get(msg);
1887 }
1888
1889 add_cap_releases(mdsc, req->r_session, -1);
1890 mutex_unlock(&session->s_mutex);
1891
1892 /* kick calling process */
1893 complete_request(mdsc, req);
1894out:
1895 ceph_mdsc_put_request(req);
1896 return;
1897}
1898
1899
1900
1901/*
1902 * handle mds notification that our request has been forwarded.
1903 */
1904static void handle_forward(struct ceph_mds_client *mdsc,
1905 struct ceph_mds_session *session,
1906 struct ceph_msg *msg)
1907{
1908 struct ceph_mds_request *req;
1909 u64 tid = le64_to_cpu(msg->hdr.tid);
1910 u32 next_mds;
1911 u32 fwd_seq;
1912 int err = -EINVAL;
1913 void *p = msg->front.iov_base;
1914 void *end = p + msg->front.iov_len;
1915
1916 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1917 next_mds = ceph_decode_32(&p);
1918 fwd_seq = ceph_decode_32(&p);
1919
1920 mutex_lock(&mdsc->mutex);
1921 req = __lookup_request(mdsc, tid);
1922 if (!req) {
1923 dout("forward %llu to mds%d - req dne\n", tid, next_mds);
1924 goto out; /* dup reply? */
1925 }
1926
1927 if (fwd_seq <= req->r_num_fwd) {
1928 dout("forward %llu to mds%d - old seq %d <= %d\n",
1929 tid, next_mds, req->r_num_fwd, fwd_seq);
1930 } else {
1931 /* resend. forward race not possible; mds would drop */
1932 dout("forward %llu to mds%d (we resend)\n", tid, next_mds);
1933 req->r_num_fwd = fwd_seq;
1934 req->r_resend_mds = next_mds;
1935 put_request_session(req);
1936 __do_request(mdsc, req);
1937 }
1938 ceph_mdsc_put_request(req);
1939out:
1940 mutex_unlock(&mdsc->mutex);
1941 return;
1942
1943bad:
1944 pr_err("mdsc_handle_forward decode error err=%d\n", err);
1945}
1946
1947/*
1948 * handle a mds session control message
1949 */
1950static void handle_session(struct ceph_mds_session *session,
1951 struct ceph_msg *msg)
1952{
1953 struct ceph_mds_client *mdsc = session->s_mdsc;
1954 u32 op;
1955 u64 seq;
1956 int mds = session->s_mds;
1957 struct ceph_mds_session_head *h = msg->front.iov_base;
1958 int wake = 0;
1959
1960 /* decode */
1961 if (msg->front.iov_len != sizeof(*h))
1962 goto bad;
1963 op = le32_to_cpu(h->op);
1964 seq = le64_to_cpu(h->seq);
1965
1966 mutex_lock(&mdsc->mutex);
1967 if (op == CEPH_SESSION_CLOSE)
1968 __unregister_session(mdsc, session);
1969 /* FIXME: this ttl calculation is generous */
1970 session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
1971 mutex_unlock(&mdsc->mutex);
1972
1973 mutex_lock(&session->s_mutex);
1974
1975 dout("handle_session mds%d %s %p state %s seq %llu\n",
1976 mds, ceph_session_op_name(op), session,
1977 session_state_name(session->s_state), seq);
1978
1979 if (session->s_state == CEPH_MDS_SESSION_HUNG) {
1980 session->s_state = CEPH_MDS_SESSION_OPEN;
1981 pr_info("mds%d came back\n", session->s_mds);
1982 }
1983
1984 switch (op) {
1985 case CEPH_SESSION_OPEN:
1986 session->s_state = CEPH_MDS_SESSION_OPEN;
1987 renewed_caps(mdsc, session, 0);
1988 wake = 1;
1989 if (mdsc->stopping)
1990 __close_session(mdsc, session);
1991 break;
1992
1993 case CEPH_SESSION_RENEWCAPS:
1994 if (session->s_renew_seq == seq)
1995 renewed_caps(mdsc, session, 1);
1996 break;
1997
1998 case CEPH_SESSION_CLOSE:
1999 remove_session_caps(session);
2000 wake = 1; /* for good measure */
2001 complete(&mdsc->session_close_waiters);
2002 kick_requests(mdsc, mds, 0); /* cur only */
2003 break;
2004
2005 case CEPH_SESSION_STALE:
2006 pr_info("mds%d caps went stale, renewing\n",
2007 session->s_mds);
2008 spin_lock(&session->s_cap_lock);
2009 session->s_cap_gen++;
2010 session->s_cap_ttl = 0;
2011 spin_unlock(&session->s_cap_lock);
2012 send_renew_caps(mdsc, session);
2013 break;
2014
2015 case CEPH_SESSION_RECALL_STATE:
2016 trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
2017 break;
2018
2019 default:
2020 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
2021 WARN_ON(1);
2022 }
2023
2024 mutex_unlock(&session->s_mutex);
2025 if (wake) {
2026 mutex_lock(&mdsc->mutex);
2027 __wake_requests(mdsc, &session->s_waiting);
2028 mutex_unlock(&mdsc->mutex);
2029 }
2030 return;
2031
2032bad:
2033 pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
2034 (int)msg->front.iov_len);
2035 ceph_msg_dump(msg);
2036 return;
2037}
2038
2039
2040/*
2041 * called under session->mutex.
2042 */
2043static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2044 struct ceph_mds_session *session)
2045{
2046 struct ceph_mds_request *req, *nreq;
2047 int err;
2048
2049 dout("replay_unsafe_requests mds%d\n", session->s_mds);
2050
2051 mutex_lock(&mdsc->mutex);
2052 list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
2053 err = __prepare_send_request(mdsc, req, session->s_mds);
2054 if (!err) {
2055 ceph_msg_get(req->r_request);
2056 ceph_con_send(&session->s_con, req->r_request);
2057 }
2058 }
2059 mutex_unlock(&mdsc->mutex);
2060}
2061
2062/*
2063 * Encode information about a cap for a reconnect with the MDS.
2064 */
2065static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2066 void *arg)
2067{
2068 struct ceph_mds_cap_reconnect rec;
2069 struct ceph_inode_info *ci;
2070 struct ceph_pagelist *pagelist = arg;
2071 char *path;
2072 int pathlen, err;
2073 u64 pathbase;
2074 struct dentry *dentry;
2075
2076 ci = cap->ci;
2077
2078 dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
2079 inode, ceph_vinop(inode), cap, cap->cap_id,
2080 ceph_cap_string(cap->issued));
2081 err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
2082 if (err)
2083 return err;
2084
2085 dentry = d_find_alias(inode);
2086 if (dentry) {
2087 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
2088 if (IS_ERR(path)) {
2089 err = PTR_ERR(path);
2090 BUG_ON(err);
2091 }
2092 } else {
2093 path = NULL;
2094 pathlen = 0;
2095 }
2096 err = ceph_pagelist_encode_string(pagelist, path, pathlen);
2097 if (err)
2098 goto out;
2099
2100 spin_lock(&inode->i_lock);
2101 cap->seq = 0; /* reset cap seq */
2102 cap->issue_seq = 0; /* and issue_seq */
2103 rec.cap_id = cpu_to_le64(cap->cap_id);
2104 rec.pathbase = cpu_to_le64(pathbase);
2105 rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2106 rec.issued = cpu_to_le32(cap->issued);
2107 rec.size = cpu_to_le64(inode->i_size);
2108 ceph_encode_timespec(&rec.mtime, &inode->i_mtime);
2109 ceph_encode_timespec(&rec.atime, &inode->i_atime);
2110 rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2111 spin_unlock(&inode->i_lock);
2112
2113 err = ceph_pagelist_append(pagelist, &rec, sizeof(rec));
2114
2115out:
2116 kfree(path);
2117 dput(dentry);
2118 return err;
2119}
2120
2121
2122/*
2123 * If an MDS fails and recovers, clients need to reconnect in order to
2124 * reestablish shared state. This includes all caps issued through
2125 * this session _and_ the snap_realm hierarchy. Because it's not
2126 * clear which snap realms the mds cares about, we send everything we
2127 * know about.. that ensures we'll then get any new info the
2128 * recovering MDS might have.
2129 *
2130 * This is a relatively heavyweight operation, but it's rare.
2131 *
2132 * called with mdsc->mutex held.
2133 */
2134static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
2135{
2136 struct ceph_mds_session *session = NULL;
2137 struct ceph_msg *reply;
2138 struct rb_node *p;
2139 int err;
2140 struct ceph_pagelist *pagelist;
2141
2142 pr_info("reconnect to recovering mds%d\n", mds);
2143
2144 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2145 if (!pagelist)
2146 goto fail_nopagelist;
2147 ceph_pagelist_init(pagelist);
2148
2149 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL);
2150 if (IS_ERR(reply)) {
2151 err = PTR_ERR(reply);
2152 goto fail_nomsg;
2153 }
2154
2155 /* find session */
2156 session = __ceph_lookup_mds_session(mdsc, mds);
2157 mutex_unlock(&mdsc->mutex); /* drop lock for duration */
2158
2159 if (session) {
2160 mutex_lock(&session->s_mutex);
2161
2162 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2163 session->s_seq = 0;
2164
2165 ceph_con_open(&session->s_con,
2166 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2167
2168 /* replay unsafe requests */
2169 replay_unsafe_requests(mdsc, session);
2170 } else {
2171 dout("no session for mds%d, will send short reconnect\n",
2172 mds);
2173 }
2174
2175 down_read(&mdsc->snap_rwsem);
2176
2177 if (!session)
2178 goto send;
2179 dout("session %p state %s\n", session,
2180 session_state_name(session->s_state));
2181
2182 /* traverse this session's caps */
2183 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2184 if (err)
2185 goto fail;
2186 err = iterate_session_caps(session, encode_caps_cb, pagelist);
2187 if (err < 0)
2188 goto out;
2189
2190 /*
2191 * snaprealms. we provide mds with the ino, seq (version), and
2192 * parent for all of our realms. If the mds has any newer info,
2193 * it will tell us.
2194 */
2195 for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
2196 struct ceph_snap_realm *realm =
2197 rb_entry(p, struct ceph_snap_realm, node);
2198 struct ceph_mds_snaprealm_reconnect sr_rec;
2199
2200 dout(" adding snap realm %llx seq %lld parent %llx\n",
2201 realm->ino, realm->seq, realm->parent_ino);
2202 sr_rec.ino = cpu_to_le64(realm->ino);
2203 sr_rec.seq = cpu_to_le64(realm->seq);
2204 sr_rec.parent = cpu_to_le64(realm->parent_ino);
2205 err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
2206 if (err)
2207 goto fail;
2208 }
2209
2210send:
2211 reply->pagelist = pagelist;
2212 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2213 reply->nr_pages = calc_pages_for(0, pagelist->length);
2214 ceph_con_send(&session->s_con, reply);
2215
2216 if (session) {
2217 session->s_state = CEPH_MDS_SESSION_OPEN;
2218 __wake_requests(mdsc, &session->s_waiting);
2219 }
2220
2221out:
2222 up_read(&mdsc->snap_rwsem);
2223 if (session) {
2224 mutex_unlock(&session->s_mutex);
2225 ceph_put_mds_session(session);
2226 }
2227 mutex_lock(&mdsc->mutex);
2228 return;
2229
2230fail:
2231 ceph_msg_put(reply);
2232fail_nomsg:
2233 ceph_pagelist_release(pagelist);
2234 kfree(pagelist);
2235fail_nopagelist:
2236 pr_err("ENOMEM preparing reconnect for mds%d\n", mds);
2237 goto out;
2238}
2239
2240
2241/*
2242 * compare old and new mdsmaps, kicking requests
2243 * and closing out old connections as necessary
2244 *
2245 * called under mdsc->mutex.
 *
 * Only ranks below both oldmap->m_max_mds and mdsc->max_sessions that
 * currently have a session are examined.  mdsc->mutex is temporarily
 * dropped while a stale connection is closed, and is held again when
 * this function returns.
2246 */
2247static void check_new_map(struct ceph_mds_client *mdsc,
2248 struct ceph_mdsmap *newmap,
2249 struct ceph_mdsmap *oldmap)
2250{
2251 int i;
2252 int oldstate, newstate;
2253 struct ceph_mds_session *s;
2254
2255 dout("check_new_map new %u old %u\n",
2256 newmap->m_epoch, oldmap->m_epoch);
2257
2258 for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
2259 if (mdsc->sessions[i] == NULL)
2260 continue;
2261 s = mdsc->sessions[i];
2262 oldstate = ceph_mdsmap_get_state(oldmap, i);
2263 newstate = ceph_mdsmap_get_state(newmap, i);
2264
2265 dout("check_new_map mds%d state %s -> %s (session %s)\n",
2266 i, ceph_mds_state_name(oldstate),
2267 ceph_mds_state_name(newstate),
2268 session_state_name(s->s_state));
2269
 /* a non-zero memcmp means the address of this rank changed */
2270 if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
2271 ceph_mdsmap_get_addr(newmap, i),
2272 sizeof(struct ceph_entity_addr))) {
2273 if (s->s_state == CEPH_MDS_SESSION_OPENING) {
2274 /* the session never opened, just close it
2275 * out now */
2276 __wake_requests(mdsc, &s->s_waiting);
 /*
  * NOTE(review): s is still dereferenced further down in this
  * iteration (s->s_state, s->s_mds); confirm the session cannot
  * be freed by __unregister_session().
  */
2277 __unregister_session(mdsc, s);
2278 } else {
2279 /* just close it */
 /*
  * mdsc->mutex is released before s_mutex is taken and then
  * re-acquired; presumably this respects an s_mutex ->
  * mdsc->mutex lock ordering -- TODO confirm.
  */
2280 mutex_unlock(&mdsc->mutex);
2281 mutex_lock(&s->s_mutex);
2282 mutex_lock(&mdsc->mutex);
2283 ceph_con_close(&s->s_con);
2284 mutex_unlock(&s->s_mutex);
2285 s->s_state = CEPH_MDS_SESSION_RESTARTING;
2286 }
2287
2288 /* kick any requests waiting on the recovering mds */
2289 kick_requests(mdsc, i, 1);
2290 } else if (oldstate == newstate) {
2291 continue; /* nothing new with this mds */
2292 }
2293
2294 /*
2295 * send reconnect?
2296 */
2297 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2298 newstate >= CEPH_MDS_STATE_RECONNECT)
2299 send_mds_reconnect(mdsc, i);
2300
2301 /*
2302 * kick requests on any mds that has gone active.
2303 *
2304 * kick requests on cur or forwarder: we may have sent
2305 * the request to mds1, mds1 told us it forwarded it
2306 * to mds2, but then we learn mds1 failed and can't be
2307 * sure it successfully forwarded our request before
2308 * it died.
2309 */
2310 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2311 newstate >= CEPH_MDS_STATE_ACTIVE) {
2312 pr_info("mds%d reconnect completed\n", s->s_mds);
2313 kick_requests(mdsc, i, 1);
2314 ceph_kick_flushing_caps(mdsc, s);
2315 wake_up_session_caps(s, 1);
2316 }
2317 }
2318}
2319
2320
2321
2322/*
2323 * leases
2324 */
2325
2326/*
2327 * caller must hold session s_mutex, dentry->d_lock
2328 */
2329void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
2330{
2331 struct ceph_dentry_info *di = ceph_dentry(dentry);
2332
2333 ceph_put_mds_session(di->lease_session);
2334 di->lease_session = NULL;
2335}
2336
/*
 * Handle a lease message received from an mds (dispatch() routes
 * CEPH_MSG_CLIENT_LEASE here).
 *
 * The message front is a struct ceph_mds_lease followed by a 32-bit
 * name length and the dentry name.  The inode and dentry it names are
 * looked up; a REVOKE drops our lease, a RENEW extends it.  A revoke,
 * or a lookup that finds nothing, is answered by sending the very same
 * message back with its action rewritten to REVOKE_ACK.
 *
 * Takes and releases session->s_mutex; malformed messages are logged
 * and dumped without taking the mutex.
 */
2337static void handle_lease(struct ceph_mds_client *mdsc,
2338 struct ceph_mds_session *session,
2339 struct ceph_msg *msg)
2340{
2341 struct super_block *sb = mdsc->client->sb;
2342 struct inode *inode;
 /* NOTE(review): ci is assigned below but not otherwise used here */
2343 struct ceph_inode_info *ci;
2344 struct dentry *parent, *dentry;
2345 struct ceph_dentry_info *di;
2346 int mds = session->s_mds;
2347 struct ceph_mds_lease *h = msg->front.iov_base;
2348 struct ceph_vino vino;
2349 int mask;
2350 struct qstr dname;
2351 int release = 0;
2352
2353 dout("handle_lease from mds%d\n", mds);
2354
2355 /* decode */
 /* the front must hold at least the header plus the name length word */
2356 if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
2357 goto bad;
2358 vino.ino = le64_to_cpu(h->ino);
2359 vino.snap = CEPH_NOSNAP;
2360 mask = le16_to_cpu(h->mask);
2361 dname.name = (void *)h + sizeof(*h) + sizeof(u32);
2362 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
 /* the encoded name length must match what is left in the front */
2363 if (dname.len != get_unaligned_le32(h+1))
2364 goto bad;
2365
2366 mutex_lock(&session->s_mutex);
2367 session->s_seq++;
2368
2369 /* lookup inode */
2370 inode = ceph_find_inode(sb, vino);
2371 dout("handle_lease '%s', mask %d, ino %llx %p\n",
2372 ceph_lease_op_name(h->action), mask, vino.ino, inode);
2373 if (inode == NULL) {
2374 dout("handle_lease no inode %llx\n", vino.ino);
2375 goto release;
2376 }
2377 ci = ceph_inode(inode);
2378
2379 /* dentry */
2380 parent = d_find_alias(inode);
2381 if (!parent) {
2382 dout("no parent dentry on inode %p\n", inode);
2383 WARN_ON(1);
2384 goto release; /* hrm... */
2385 }
2386 dname.hash = full_name_hash(dname.name, dname.len);
2387 dentry = d_lookup(parent, &dname);
2388 dput(parent);
2389 if (!dentry)
2390 goto release;
2391
2392 spin_lock(&dentry->d_lock);
2393 di = ceph_dentry(dentry);
2394 switch (h->action) {
2395 case CEPH_MDS_LEASE_REVOKE:
2396 if (di && di->lease_session == session) {
 /* the reply reuses this message, so record our lease seq in it */
2397 h->seq = cpu_to_le32(di->lease_seq);
2398 __ceph_mdsc_drop_dentry_lease(dentry);
2399 }
2400 release = 1;
2401 break;
2402
2403 case CEPH_MDS_LEASE_RENEW:
 /* only accept a renewal for a lease we hold and asked to renew */
2404 if (di && di->lease_session == session &&
2405 di->lease_gen == session->s_cap_gen &&
2406 di->lease_renew_from &&
2407 di->lease_renew_after == 0) {
 /* convert the duration from milliseconds to jiffies */
2408 unsigned long duration =
2409 le32_to_cpu(h->duration_ms) * HZ / 1000;
2410
2411 di->lease_seq = le32_to_cpu(h->seq);
2412 dentry->d_time = di->lease_renew_from + duration;
 /* next renewal becomes due half way through the new term */
2413 di->lease_renew_after = di->lease_renew_from +
2414 (duration >> 1);
2415 di->lease_renew_from = 0;
2416 }
2417 break;
2418 }
2419 spin_unlock(&dentry->d_lock);
2420 dput(dentry);
2421
2422 if (!release)
2423 goto out;
2424
2425release:
2426 /* let's just reuse the same message */
2427 h->action = CEPH_MDS_LEASE_REVOKE_ACK;
 /* take an extra message reference before handing it to the connection */
2428 ceph_msg_get(msg);
2429 ceph_con_send(&session->s_con, msg);
2430
2431out:
 /* inode is NULL here when the lookup above failed */
2432 iput(inode);
2433 mutex_unlock(&session->s_mutex);
2434 return;
2435
2436bad:
2437 pr_err("corrupt lease message\n");
2438 ceph_msg_dump(msg);
2439}
2440
2441void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2442 struct inode *inode,
2443 struct dentry *dentry, char action,
2444 u32 seq)
2445{
2446 struct ceph_msg *msg;
2447 struct ceph_mds_lease *lease;
2448 int len = sizeof(*lease) + sizeof(u32);
2449 int dnamelen = 0;
2450
2451 dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
2452 inode, dentry, ceph_lease_op_name(action), session->s_mds);
2453 dnamelen = dentry->d_name.len;
2454 len += dnamelen;
2455
2456 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
2457 if (IS_ERR(msg))
2458 return;
2459 lease = msg->front.iov_base;
2460 lease->action = action;
2461 lease->mask = cpu_to_le16(CEPH_LOCK_DN);
2462 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2463 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2464 lease->seq = cpu_to_le32(seq);
2465 put_unaligned_le32(dnamelen, lease + 1);
2466 memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
2467
2468 /*
2469 * if this is a preemptive lease RELEASE, no need to
2470 * flush request stream, since the actual request will
2471 * soon follow.
2472 */
2473 msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
2474
2475 ceph_con_send(&session->s_con, msg);
2476}
2477
2478/*
2479 * Preemptively release a lease we expect to invalidate anyway.
2480 * Pass @inode always, @dentry is optional.
2481 */
2482void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2483 struct dentry *dentry, int mask)
2484{
2485 struct ceph_dentry_info *di;
2486 struct ceph_mds_session *session;
2487 u32 seq;
2488
2489 BUG_ON(inode == NULL);
2490 BUG_ON(dentry == NULL);
2491 BUG_ON(mask != CEPH_LOCK_DN);
2492
2493 /* is dentry lease valid? */
2494 spin_lock(&dentry->d_lock);
2495 di = ceph_dentry(dentry);
2496 if (!di || !di->lease_session ||
2497 di->lease_session->s_mds < 0 ||
2498 di->lease_gen != di->lease_session->s_cap_gen ||
2499 !time_before(jiffies, dentry->d_time)) {
2500 dout("lease_release inode %p dentry %p -- "
2501 "no lease on %d\n",
2502 inode, dentry, mask);
2503 spin_unlock(&dentry->d_lock);
2504 return;
2505 }
2506
2507 /* we do have a lease on this dentry; note mds and seq */
2508 session = ceph_get_mds_session(di->lease_session);
2509 seq = di->lease_seq;
2510 __ceph_mdsc_drop_dentry_lease(dentry);
2511 spin_unlock(&dentry->d_lock);
2512
2513 dout("lease_release inode %p dentry %p mask %d to mds%d\n",
2514 inode, dentry, mask, session->s_mds);
2515 ceph_mdsc_lease_send_msg(session, inode, dentry,
2516 CEPH_MDS_LEASE_RELEASE, seq);
2517 ceph_put_mds_session(session);
2518}
2519
2520/*
2521 * drop all leases (and dentry refs) in preparation for umount
2522 */
2523static void drop_leases(struct ceph_mds_client *mdsc)
2524{
2525 int i;
2526
2527 dout("drop_leases\n");
2528 mutex_lock(&mdsc->mutex);
2529 for (i = 0; i < mdsc->max_sessions; i++) {
2530 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2531 if (!s)
2532 continue;
2533 mutex_unlock(&mdsc->mutex);
2534 mutex_lock(&s->s_mutex);
2535 mutex_unlock(&s->s_mutex);
2536 ceph_put_mds_session(s);
2537 mutex_lock(&mdsc->mutex);
2538 }
2539 mutex_unlock(&mdsc->mutex);
2540}
2541
2542
2543
2544/*
2545 * delayed work -- periodically trim expired leases, renew caps with mds
2546 */
2547static void schedule_delayed(struct ceph_mds_client *mdsc)
2548{
2549 int delay = 5;
2550 unsigned hz = round_jiffies_relative(HZ * delay);
2551 schedule_delayed_work(&mdsc->delayed_work, hz);
2552}
2553
/*
 * Periodic worker (see schedule_delayed()).
 *
 * Checks delayed caps, then walks all sessions: re-sends a close
 * request for sessions that are CLOSING, marks OPEN sessions whose
 * s_ttl has passed as HUNG, and for every session in state OPEN or
 * beyond either renews caps (once a quarter of the mdsmap session
 * timeout has elapsed since the last renewal) or sends a keepalive,
 * followed by cap release processing.  Re-arms itself before returning.
 */
2554static void delayed_work(struct work_struct *work)
2555{
2556 int i;
2557 struct ceph_mds_client *mdsc =
2558 container_of(work, struct ceph_mds_client, delayed_work.work);
2559 int renew_interval;
2560 int renew_caps;
2561
2562 dout("mdsc delayed_work\n");
2563 ceph_check_delayed_caps(mdsc);
2564
2565 mutex_lock(&mdsc->mutex);
 /* renew after a quarter of the session timeout */
2566 renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
2567 renew_caps = time_after_eq(jiffies, HZ*renew_interval +
2568 mdsc->last_renew_caps);
2569 if (renew_caps)
2570 mdsc->last_renew_caps = jiffies;
2571
2572 for (i = 0; i < mdsc->max_sessions; i++) {
 /* the lookup hands back a reference that every path below drops */
2573 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2574 if (s == NULL)
2575 continue;
2576 if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
2577 dout("resending session close request for mds%d\n",
2578 s->s_mds);
2579 request_close_session(mdsc, s);
2580 ceph_put_mds_session(s);
2581 continue;
2582 }
2583 if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
2584 if (s->s_state == CEPH_MDS_SESSION_OPEN) {
2585 s->s_state = CEPH_MDS_SESSION_HUNG;
2586 pr_info("mds%d hung\n", s->s_mds);
2587 }
2588 }
 /* relies on the numeric ordering of the session state enum */
2589 if (s->s_state < CEPH_MDS_SESSION_OPEN) {
2590 /* this mds is failed or recovering, just wait */
2591 ceph_put_mds_session(s);
2592 continue;
2593 }
 /* mdsc->mutex is not held while s_mutex is taken */
2594 mutex_unlock(&mdsc->mutex);
2595
2596 mutex_lock(&s->s_mutex);
2597 if (renew_caps)
2598 send_renew_caps(mdsc, s);
2599 else
2600 ceph_con_keepalive(&s->s_con);
2601 add_cap_releases(mdsc, s, -1);
2602 send_cap_releases(mdsc, s);
2603 mutex_unlock(&s->s_mutex);
2604 ceph_put_mds_session(s);
2605
2606 mutex_lock(&mdsc->mutex);
2607 }
2608 mutex_unlock(&mdsc->mutex);
2609
2610 schedule_delayed(mdsc);
2611}
2612
2613
2614int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2615{
2616 mdsc->client = client;
2617 mutex_init(&mdsc->mutex);
2618 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2619 init_completion(&mdsc->safe_umount_waiters);
2620 init_completion(&mdsc->session_close_waiters);
2621 INIT_LIST_HEAD(&mdsc->waiting_for_map);
2622 mdsc->sessions = NULL;
2623 mdsc->max_sessions = 0;
2624 mdsc->stopping = 0;
2625 init_rwsem(&mdsc->snap_rwsem);
2626 mdsc->snap_realms = RB_ROOT;
2627 INIT_LIST_HEAD(&mdsc->snap_empty);
2628 spin_lock_init(&mdsc->snap_empty_lock);
2629 mdsc->last_tid = 0;
2630 mdsc->request_tree = RB_ROOT;
2631 INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
2632 mdsc->last_renew_caps = jiffies;
2633 INIT_LIST_HEAD(&mdsc->cap_delay_list);
2634 spin_lock_init(&mdsc->cap_delay_lock);
2635 INIT_LIST_HEAD(&mdsc->snap_flush_list);
2636 spin_lock_init(&mdsc->snap_flush_lock);
2637 mdsc->cap_flush_seq = 0;
2638 INIT_LIST_HEAD(&mdsc->cap_dirty);
2639 mdsc->num_cap_flushing = 0;
2640 spin_lock_init(&mdsc->cap_dirty_lock);
2641 init_waitqueue_head(&mdsc->cap_flushing_wq);
2642 spin_lock_init(&mdsc->dentry_lru_lock);
2643 INIT_LIST_HEAD(&mdsc->dentry_lru);
2644 return 0;
2645}
2646
2647/*
2648 * Wait for safe replies on open mds requests. If we time out, drop
2649 * all requests from the tree to avoid dangling dentry refs.
2650 */
2651static void wait_requests(struct ceph_mds_client *mdsc)
2652{
2653 struct ceph_mds_request *req;
2654 struct ceph_client *client = mdsc->client;
2655
2656 mutex_lock(&mdsc->mutex);
2657 if (__get_oldest_req(mdsc)) {
2658 mutex_unlock(&mdsc->mutex);
2659
2660 dout("wait_requests waiting for requests\n");
2661 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
2662 client->mount_args->mount_timeout * HZ);
2663
2664 /* tear down remaining requests */
2665 mutex_lock(&mdsc->mutex);
2666 while ((req = __get_oldest_req(mdsc))) {
2667 dout("wait_requests timed out on tid %llu\n",
2668 req->r_tid);
2669 __unregister_request(mdsc, req);
2670 }
2671 }
2672 mutex_unlock(&mdsc->mutex);
2673 dout("wait_requests done\n");
2674}
2675
2676/*
2677 * called before mount is ro, and before dentries are torn down.
2678 * (hmm, does this still race with new lookups?)
2679 */
2680void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
2681{
2682 dout("pre_umount\n");
2683 mdsc->stopping = 1;
2684
2685 drop_leases(mdsc);
2686 ceph_flush_dirty_caps(mdsc);
2687 wait_requests(mdsc);
2688}
2689
2690/*
2691 * wait for all write mds requests to flush.
 *
 * Walks the registered requests in tree order, starting from the
 * oldest, and for each write operation with tid <= @want_tid waits for
 * its r_safe_completion.  mdsc->mutex is dropped while waiting.
2692 */
2693static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
2694{
2695 struct ceph_mds_request *req = NULL, *nextreq;
2696 struct rb_node *n;
2697
2698 mutex_lock(&mdsc->mutex);
2699 dout("wait_unsafe_requests want %lld\n", want_tid);
2700restart:
2701 req = __get_oldest_req(mdsc);
2702 while (req && req->r_tid <= want_tid) {
2703 /* find next request */
2704 n = rb_next(&req->r_node);
2705 if (n)
2706 nextreq = rb_entry(n, struct ceph_mds_request, r_node);
2707 else
2708 nextreq = NULL;
2709 if ((req->r_op & CEPH_MDS_OP_WRITE)) {
2710 /* write op */
 /*
  * pin both the current and the next request so neither can be
  * freed while mdsc->mutex is dropped for the wait
  */
2711 ceph_mdsc_get_request(req);
2712 if (nextreq)
2713 ceph_mdsc_get_request(nextreq);
2714 mutex_unlock(&mdsc->mutex);
2715 dout("wait_unsafe_requests wait on %llu (want %llu)\n",
2716 req->r_tid, want_tid);
2717 wait_for_completion(&req->r_safe_completion);
2718 mutex_lock(&mdsc->mutex);
2719 ceph_mdsc_put_request(req);
2720 if (!nextreq)
2721 break; /* next dne before, so we're done! */
 /*
  * if the pinned successor left the tree while we slept, its
  * position is meaningless: start over from the oldest request
  */
2722 if (RB_EMPTY_NODE(&nextreq->r_node)) {
2723 /* next request was removed from tree */
2724 ceph_mdsc_put_request(nextreq);
2725 goto restart;
2726 }
2727 ceph_mdsc_put_request(nextreq); /* won't go away */
2728 }
2729 req = nextreq;
2730 }
2731 mutex_unlock(&mdsc->mutex);
2732 dout("wait_unsafe_requests done\n");
2733}
2734
2735void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2736{
2737 u64 want_tid, want_flush;
2738
2739 dout("sync\n");
2740 mutex_lock(&mdsc->mutex);
2741 want_tid = mdsc->last_tid;
2742 want_flush = mdsc->cap_flush_seq;
2743 mutex_unlock(&mdsc->mutex);
2744 dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
2745
2746 ceph_flush_dirty_caps(mdsc);
2747
2748 wait_unsafe_requests(mdsc, want_tid);
2749 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
2750}
2751
2752
2753/*
2754 * called after sb is ro.
 *
 * Repeatedly asks every session to close until none remain, the mount
 * timeout has elapsed, or the client is in CEPH_MOUNT_SHUTDOWN; then
 * forcibly unregisters any sessions still present, removes their caps,
 * cleans up empty snap realms and cancels the delayed work.
2755 */
2756void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
2757{
2758 struct ceph_mds_session *session;
2759 int i;
2760 int n;
2761 struct ceph_client *client = mdsc->client;
2762 unsigned long started, timeout = client->mount_args->mount_timeout * HZ;
2763
2764 dout("close_sessions\n");
2765
2766 mutex_lock(&mdsc->mutex);
2767
2768 /* close sessions */
2769 started = jiffies;
2770 while (time_before(jiffies, started + timeout)) {
2771 dout("closing sessions\n");
 /* n counts the sessions still registered on this pass */
2772 n = 0;
2773 for (i = 0; i < mdsc->max_sessions; i++) {
2774 session = __ceph_lookup_mds_session(mdsc, i);
2775 if (!session)
2776 continue;
 /* mdsc->mutex is dropped before s_mutex is taken */
2777 mutex_unlock(&mdsc->mutex);
2778 mutex_lock(&session->s_mutex);
2779 __close_session(mdsc, session);
2780 mutex_unlock(&session->s_mutex);
2781 ceph_put_mds_session(session);
2782 mutex_lock(&mdsc->mutex);
2783 n++;
2784 }
2785 if (n == 0)
2786 break;
2787
2788 if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
2789 break;
2790
2791 dout("waiting for sessions to close\n");
2792 mutex_unlock(&mdsc->mutex);
 /*
  * NOTE(review): each wait may itself last the full timeout, in
  * addition to the overall limit checked by the while condition.
  */
2793 wait_for_completion_timeout(&mdsc->session_close_waiters,
2794 timeout);
2795 mutex_lock(&mdsc->mutex);
2796 }
2797
2798 /* tear down remaining sessions */
2799 for (i = 0; i < mdsc->max_sessions; i++) {
2800 if (mdsc->sessions[i]) {
 /* hold our own reference across the unregister */
2801 session = get_session(mdsc->sessions[i]);
2802 __unregister_session(mdsc, session);
2803 mutex_unlock(&mdsc->mutex);
2804 mutex_lock(&session->s_mutex);
2805 remove_session_caps(session);
2806 mutex_unlock(&session->s_mutex);
2807 ceph_put_mds_session(session);
2808 mutex_lock(&mdsc->mutex);
2809 }
2810 }
2811
2812 WARN_ON(!list_empty(&mdsc->cap_delay_list));
2813
2814 mutex_unlock(&mdsc->mutex);
2815
2816 ceph_cleanup_empty_realms(mdsc);
2817
2818 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2819
2820 dout("stopped\n");
2821}
2822
2823void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
2824{
2825 dout("stop\n");
2826 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2827 if (mdsc->mdsmap)
2828 ceph_mdsmap_destroy(mdsc->mdsmap);
2829 kfree(mdsc->sessions);
2830}
2831
2832
2833/*
2834 * handle mds map update.
 *
 * The message front carries an fsid, the map epoch, the encoded map
 * length and the encoded map.  Messages with a non-matching fsid and
 * maps that are not newer than the current one are ignored.  A newer
 * map replaces the current one, check_new_map() reacts to the
 * differences, requests waiting for a map are woken and the delayed
 * work is (re)scheduled.
2835 */
2836void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
2837{
2838 u32 epoch;
2839 u32 maplen;
2840 void *p = msg->front.iov_base;
2841 void *end = p + msg->front.iov_len;
2842 struct ceph_mdsmap *newmap, *oldmap;
2843 struct ceph_fsid fsid;
 /* reported as-is if the fixed-size header is too short -- presumably
  * ceph_decode_need() jumps to the given label; TODO confirm */
2844 int err = -EINVAL;
2845
2846 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
2847 ceph_decode_copy(&p, &fsid, sizeof(fsid));
2848 if (ceph_check_fsid(mdsc->client, &fsid) < 0)
2849 return;
2850 epoch = ceph_decode_32(&p);
2851 maplen = ceph_decode_32(&p);
2852 dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
2853
2854 /* do we need it? */
2855 ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
2856 mutex_lock(&mdsc->mutex);
2857 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
2858 dout("handle_map epoch %u <= our %u\n",
2859 epoch, mdsc->mdsmap->m_epoch);
2860 mutex_unlock(&mdsc->mutex);
2861 return;
2862 }
2863
2864 newmap = ceph_mdsmap_decode(&p, end);
2865 if (IS_ERR(newmap)) {
2866 err = PTR_ERR(newmap);
2867 goto bad_unlock;
2868 }
2869
2870 /* swap into place */
2871 if (mdsc->mdsmap) {
2872 oldmap = mdsc->mdsmap;
2873 mdsc->mdsmap = newmap;
 /* the old map is only destroyed after the comparison is done */
2874 check_new_map(mdsc, newmap, oldmap);
2875 ceph_mdsmap_destroy(oldmap);
2876 } else {
2877 mdsc->mdsmap = newmap; /* first mds map */
2878 }
 /* propagate the map's max file size to the superblock */
2879 mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
2880
2881 __wake_requests(mdsc, &mdsc->waiting_for_map);
2882
2883 mutex_unlock(&mdsc->mutex);
2884 schedule_delayed(mdsc);
2885 return;
2886
2887bad_unlock:
2888 mutex_unlock(&mdsc->mutex);
2889bad:
2890 pr_err("error decoding mdsmap %d\n", err);
2891 return;
2892}
2893
2894static struct ceph_connection *con_get(struct ceph_connection *con)
2895{
2896 struct ceph_mds_session *s = con->private;
2897
2898 if (get_session(s)) {
2899 dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
2900 return con;
2901 }
2902 dout("mdsc con_get %p FAIL\n", s);
2903 return NULL;
2904}
2905
2906static void con_put(struct ceph_connection *con)
2907{
2908 struct ceph_mds_session *s = con->private;
2909
2910 ceph_put_mds_session(s);
2911 dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref));
2912}
2913
2914/*
2915 * if the client is unresponsive for long enough, the mds will kill
2916 * the session entirely.
2917 */
2918static void peer_reset(struct ceph_connection *con)
2919{
2920 struct ceph_mds_session *s = con->private;
2921
2922 pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n",
2923 s->s_mds);
2924}
2925
2926static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
2927{
2928 struct ceph_mds_session *s = con->private;
2929 struct ceph_mds_client *mdsc = s->s_mdsc;
2930 int type = le16_to_cpu(msg->hdr.type);
2931
2932 mutex_lock(&mdsc->mutex);
2933 if (__verify_registered_session(mdsc, s) < 0) {
2934 mutex_unlock(&mdsc->mutex);
2935 goto out;
2936 }
2937 mutex_unlock(&mdsc->mutex);
2938
2939 switch (type) {
2940 case CEPH_MSG_MDS_MAP:
2941 ceph_mdsc_handle_map(mdsc, msg);
2942 break;
2943 case CEPH_MSG_CLIENT_SESSION:
2944 handle_session(s, msg);
2945 break;
2946 case CEPH_MSG_CLIENT_REPLY:
2947 handle_reply(s, msg);
2948 break;
2949 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2950 handle_forward(mdsc, s, msg);
2951 break;
2952 case CEPH_MSG_CLIENT_CAPS:
2953 ceph_handle_caps(s, msg);
2954 break;
2955 case CEPH_MSG_CLIENT_SNAP:
2956 ceph_handle_snap(mdsc, s, msg);
2957 break;
2958 case CEPH_MSG_CLIENT_LEASE:
2959 handle_lease(mdsc, s, msg);
2960 break;
2961
2962 default:
2963 pr_err("received unknown message type %d %s\n", type,
2964 ceph_msg_type_name(type));
2965 }
2966out:
2967 ceph_msg_put(msg);
2968}
2969
2970/*
2971 * authentication
2972 */
2973static int get_authorizer(struct ceph_connection *con,
2974 void **buf, int *len, int *proto,
2975 void **reply_buf, int *reply_len, int force_new)
2976{
2977 struct ceph_mds_session *s = con->private;
2978 struct ceph_mds_client *mdsc = s->s_mdsc;
2979 struct ceph_auth_client *ac = mdsc->client->monc.auth;
2980 int ret = 0;
2981
2982 if (force_new && s->s_authorizer) {
2983 ac->ops->destroy_authorizer(ac, s->s_authorizer);
2984 s->s_authorizer = NULL;
2985 }
2986 if (s->s_authorizer == NULL) {
2987 if (ac->ops->create_authorizer) {
2988 ret = ac->ops->create_authorizer(
2989 ac, CEPH_ENTITY_TYPE_MDS,
2990 &s->s_authorizer,
2991 &s->s_authorizer_buf,
2992 &s->s_authorizer_buf_len,
2993 &s->s_authorizer_reply_buf,
2994 &s->s_authorizer_reply_buf_len);
2995 if (ret)
2996 return ret;
2997 }
2998 }
2999
3000 *proto = ac->protocol;
3001 *buf = s->s_authorizer_buf;
3002 *len = s->s_authorizer_buf_len;
3003 *reply_buf = s->s_authorizer_reply_buf;
3004 *reply_len = s->s_authorizer_reply_buf_len;
3005 return 0;
3006}
3007
3008
3009static int verify_authorizer_reply(struct ceph_connection *con, int len)
3010{
3011 struct ceph_mds_session *s = con->private;
3012 struct ceph_mds_client *mdsc = s->s_mdsc;
3013 struct ceph_auth_client *ac = mdsc->client->monc.auth;
3014
3015 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
3016}
3017
3018static int invalidate_authorizer(struct ceph_connection *con)
3019{
3020 struct ceph_mds_session *s = con->private;
3021 struct ceph_mds_client *mdsc = s->s_mdsc;
3022 struct ceph_auth_client *ac = mdsc->client->monc.auth;
3023
3024 if (ac->ops->invalidate_authorizer)
3025 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3026
3027 return ceph_monc_validate_auth(&mdsc->client->monc);
3028}
3029
3030const static struct ceph_connection_operations mds_con_ops = {
3031 .get = con_get,
3032 .put = con_put,
3033 .dispatch = dispatch,
3034 .get_authorizer = get_authorizer,
3035 .verify_authorizer_reply = verify_authorizer_reply,
3036 .invalidate_authorizer = invalidate_authorizer,
3037 .peer_reset = peer_reset,
3038};
3039
3040
3041
3042
3043/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
new file mode 100644
index 000000000000..961cc6f65878
--- /dev/null
+++ b/fs/ceph/mds_client.h
@@ -0,0 +1,335 @@
1#ifndef _FS_CEPH_MDS_CLIENT_H
2#define _FS_CEPH_MDS_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/list.h>
7#include <linux/mutex.h>
8#include <linux/rbtree.h>
9#include <linux/spinlock.h>
10
11#include "types.h"
12#include "messenger.h"
13#include "mdsmap.h"
14
15/*
16 * Some lock dependencies:
17 *
18 * session->s_mutex
19 * mdsc->mutex
20 *
21 * mdsc->snap_rwsem
22 *
23 * inode->i_lock
24 * mdsc->snap_flush_lock
25 * mdsc->cap_delay_lock
26 *
27 */
28
29struct ceph_client;
30struct ceph_cap;
31
32/*
33 * parsed info about a single inode. pointers are into the encoded
34 * on-wire structures within the mds reply message payload.
 *
 * Each *_len field presumably gives the byte length of the data its
 * companion pointer refers to -- TODO confirm against the parser.
35 */
36struct ceph_mds_reply_info_in {
37 struct ceph_mds_reply_inode *in;
38 u32 symlink_len;
39 char *symlink;
40 u32 xattr_len;
41 char *xattr_data;
42};
43
44/*
45 * parsed info about an mds reply, including information about the
46 * target inode and/or its parent directory and dentry, and directory
47 * contents (for readdir results).
48 */
49struct ceph_mds_reply_info_parsed {
50 struct ceph_mds_reply_head *head;
51
 /* trace part: presumably parent dir inode, dentry and target inode */
52 struct ceph_mds_reply_info_in diri, targeti;
53 struct ceph_mds_reply_dirfrag *dirfrag;
54 char *dname;
55 u32 dname_len;
56 struct ceph_mds_reply_lease *dlease;
57
 /* directory contents: presumably dir_nr entries in the parallel
  * dir_* arrays below -- TODO confirm against the parser */
58 struct ceph_mds_reply_dirfrag *dir_dir;
59 int dir_nr;
60 char **dir_dname;
61 u32 *dir_dname_len;
62 struct ceph_mds_reply_lease **dir_dlease;
63 struct ceph_mds_reply_info_in *dir_in;
64 u8 dir_complete, dir_end;
65
66 /* encoded blob describing snapshot contexts for certain
67 operations (e.g., open) */
68 void *snapblob;
69 int snapblob_len;
70};
71
72
73/*
74 * cap releases are batched and sent to the MDS en masse.
 *
 * CEPH_CAPS_PER_RELEASE is the number of struct ceph_mds_cap_item
 * entries that fit into PAGE_CACHE_SIZE bytes after one
 * struct ceph_mds_cap_release header.
75 */
76#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \
77 sizeof(struct ceph_mds_cap_release)) / \
78 sizeof(struct ceph_mds_cap_item))
79
80
81/*
82 * state associated with each MDS<->client session
 *
 * The numeric order matters: code compares states with '<' (for
 * example "s_state < CEPH_MDS_SESSION_OPEN"), so do not renumber
 * without checking those comparisons.
83 */
84enum {
85 CEPH_MDS_SESSION_NEW = 1,
86 CEPH_MDS_SESSION_OPENING = 2,
87 CEPH_MDS_SESSION_OPEN = 3,
88 CEPH_MDS_SESSION_HUNG = 4,
89 CEPH_MDS_SESSION_CLOSING = 5,
90 CEPH_MDS_SESSION_RESTARTING = 6,
91 CEPH_MDS_SESSION_RECONNECTING = 7,
92};
93
/*
 * One session between this client and a single mds.  Reference counted
 * through s_ref (see ceph_get_mds_session() / ceph_put_mds_session()).
 */
94struct ceph_mds_session {
95 struct ceph_mds_client *s_mdsc;
96 int s_mds;
 /* one of the CEPH_MDS_SESSION_* values */
97 int s_state;
98 unsigned long s_ttl; /* time until mds kills us */
99 u64 s_seq; /* incoming msg seq # */
100 struct mutex s_mutex; /* serialize session messages */
101
102 struct ceph_connection s_con;
103
 /* authorizer and its request/reply buffers, cached per session */
104 struct ceph_authorizer *s_authorizer;
105 void *s_authorizer_buf, *s_authorizer_reply_buf;
106 size_t s_authorizer_buf_len, s_authorizer_reply_buf_len;
107
108 /* protected by s_cap_lock */
109 spinlock_t s_cap_lock;
110 u32 s_cap_gen; /* inc each time we get mds stale msg */
111 unsigned long s_cap_ttl; /* when session caps expire */
112 struct list_head s_caps; /* all caps issued by this session */
113 int s_nr_caps, s_trim_caps;
114 int s_num_cap_releases;
115 struct list_head s_cap_releases; /* waiting cap_release messages */
116 struct list_head s_cap_releases_done; /* ready to send */
117 struct ceph_cap *s_cap_iterator;
118
 /* NOTE(review): "mutex" below presumably means s_mutex -- confirm */
119 /* protected by mutex */
120 struct list_head s_cap_flushing; /* inodes w/ flushing caps */
121 struct list_head s_cap_snaps_flushing;
122 unsigned long s_renew_requested; /* last time we sent a renew req */
123 u64 s_renew_seq;
124
 /* reference count */
125 atomic_t s_ref;
126 struct list_head s_waiting; /* waiting requests */
127 struct list_head s_unsafe; /* unsafe requests */
128};
129
130/*
131 * modes of choosing which MDS to send a request to
 * (presumably the values stored in a request's r_direct_mode -- TODO
 * confirm)
132 */
133enum {
134 USE_ANY_MDS,
135 USE_RANDOM_MDS,
136 USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */
137};
138
139struct ceph_mds_request;
140struct ceph_mds_client;
141
142/*
143 * request completion callback
144 */
145typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
146 struct ceph_mds_request *req);
147
148/*
149 * an in-flight mds request
 *
 * Reference counted through r_kref (see ceph_mdsc_get_request() and
 * ceph_mdsc_put_request()).
150 */
151struct ceph_mds_request {
152 u64 r_tid; /* transaction id */
 /* rb-tree linkage; presumably into mdsc->request_tree -- TODO confirm */
153 struct rb_node r_node;
154
155 int r_op; /* mds op code */
156 int r_mds;
157
158 /* operation on what? */
159 struct inode *r_inode; /* arg1 */
160 struct dentry *r_dentry; /* arg1 */
161 struct dentry *r_old_dentry; /* arg2: rename from or link from */
162 char *r_path1, *r_path2;
163 struct ceph_vino r_ino1, r_ino2;
164
165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
166 struct inode *r_target_inode; /* resulting inode */
167
168 union ceph_mds_request_args r_args;
169 int r_fmode; /* file mode, if expecting cap */
170
171 /* for choosing which mds to send this request to */
172 int r_direct_mode;
173 u32 r_direct_hash; /* choose dir frag based on this dentry hash */
174 bool r_direct_is_hash; /* true if r_direct_hash is valid */
175
176 /* data payload is used for xattr ops */
177 struct page **r_pages;
178 int r_num_pages;
179 int r_data_len;
180
181 /* what caps shall we drop? */
182 int r_inode_drop, r_inode_unless;
183 int r_dentry_drop, r_dentry_unless;
184 int r_old_dentry_drop, r_old_dentry_unless;
185 struct inode *r_old_inode;
186 int r_old_inode_drop, r_old_inode_unless;
187
188 struct ceph_msg *r_request; /* original request */
189 struct ceph_msg *r_reply;
190 struct ceph_mds_reply_info_parsed r_reply_info;
191 int r_err;
192 bool r_aborted;
193
194 unsigned long r_timeout; /* optional. jiffies */
195 unsigned long r_started; /* start time to measure timeout against */
196 unsigned long r_request_started; /* start time for mds request only,
197 used to measure lease durations */
198
199 /* link unsafe requests to parent directory, for fsync */
200 struct inode *r_unsafe_dir;
201 struct list_head r_unsafe_dir_item;
202
203 struct ceph_mds_session *r_session;
204
205 int r_attempts; /* resend attempts */
206 int r_num_fwd; /* number of forward attempts */
207 int r_num_stale;
208 int r_resend_mds; /* mds to resend to next, if any*/
209
 /* reference count */
210 struct kref r_kref;
211 struct list_head r_wait;
212 struct completion r_completion;
 /* waited on by callers that need write requests to be safe */
213 struct completion r_safe_completion;
214 ceph_mds_request_callback_t r_callback;
215 struct list_head r_unsafe_item; /* per-session unsafe list item */
216 bool r_got_unsafe, r_got_safe;
217
218 bool r_did_prepopulate;
219 u32 r_readdir_offset;
220
221 struct ceph_cap_reservation r_caps_reservation;
222 int r_num_caps;
223};
224
225/*
226 * mds client state
227 */
228struct ceph_mds_client {
229 struct ceph_client *client;
230 struct mutex mutex; /* all nested structures */
231
232 struct ceph_mdsmap *mdsmap;
233 struct completion safe_umount_waiters, session_close_waiters;
234 struct list_head waiting_for_map;
235
236 struct ceph_mds_session **sessions; /* NULL for mds if no session */
237 int max_sessions; /* len of the sessions array */
238 int stopping; /* true if shutting down */
239
240 /*
241 * snap_rwsem will cover cap linkage into snaprealms, and
242 * realm snap contexts. (later, we can do per-realm snap
243 * contexts locks..) the empty list contains realms with no
244 * references (implying they contain no inodes with caps) that
245 * should be destroyed.
246 */
247 struct rw_semaphore snap_rwsem;
248 struct rb_root snap_realms;
249 struct list_head snap_empty;
250 spinlock_t snap_empty_lock; /* protect snap_empty */
251
252 u64 last_tid; /* most recent mds request */
253 struct rb_root request_tree; /* pending mds requests */
254 struct delayed_work delayed_work; /* delayed work */
255 unsigned long last_renew_caps; /* last time we renewed our caps */
256 struct list_head cap_delay_list; /* caps with delayed release */
257 spinlock_t cap_delay_lock; /* protects cap_delay_list */
258 struct list_head snap_flush_list; /* cap_snaps ready to flush */
259 spinlock_t snap_flush_lock;
260
261 u64 cap_flush_seq;
262 struct list_head cap_dirty; /* inodes with dirty caps */
263 int num_cap_flushing; /* # caps we are flushing */
264 spinlock_t cap_dirty_lock; /* protects above items */
265 wait_queue_head_t cap_flushing_wq;
266
267#ifdef CONFIG_DEBUG_FS
268 struct dentry *debugfs_file;
269#endif
270
 /* presumably dentry_lru_lock protects dentry_lru and num_dentry --
  * TODO confirm */
271 spinlock_t dentry_lru_lock;
272 struct list_head dentry_lru;
273 int num_dentry;
274};
275
276extern const char *ceph_mds_op_name(int op);
277
278extern struct ceph_mds_session *
279__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
280
/*
 * Take an additional reference on @s by incrementing s_ref, and return
 * @s so the call can be used in an assignment.  @s must not be NULL.
 */
281static inline struct ceph_mds_session *
282ceph_get_mds_session(struct ceph_mds_session *s)
283{
284 atomic_inc(&s->s_ref);
285 return s;
286}
287
288extern void ceph_put_mds_session(struct ceph_mds_session *s);
289
290extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
291 struct ceph_msg *msg, int mds);
292
293extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
294 struct ceph_client *client);
295extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
296extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
297
298extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
299
300extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
301 struct inode *inode,
302 struct dentry *dn, int mask);
303
304extern struct ceph_mds_request *
305ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
306extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
307 struct ceph_mds_request *req);
308extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
309 struct inode *dir,
310 struct ceph_mds_request *req);
/* Take an additional reference on @req (kref_get on r_kref). */
311static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
312{
313 kref_get(&req->r_kref);
314}
315extern void ceph_mdsc_release_request(struct kref *kref);
/*
 * Drop a reference on @req; ceph_mdsc_release_request() is passed to
 * kref_put() as the release function for the final reference.
 */
316static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
317{
318 kref_put(&req->r_kref, ceph_mdsc_release_request);
319}
320
321extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
322
323extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
324 int stop_on_nosnap);
325
326extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
327extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
328 struct inode *inode,
329 struct dentry *dentry, char action,
330 u32 seq);
331
332extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
333 struct ceph_msg *msg);
334
335#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
new file mode 100644
index 000000000000..c4c498e6dfef
--- /dev/null
+++ b/fs/ceph/mdsmap.c
@@ -0,0 +1,174 @@
1#include "ceph_debug.h"
2
3#include <linux/bug.h>
4#include <linux/err.h>
5#include <linux/random.h>
6#include <linux/slab.h>
7#include <linux/types.h>
8
9#include "mdsmap.h"
10#include "messenger.h"
11#include "decode.h"
12
13#include "super.h"
14
15
16/*
17 * choose a random mds that is "up" (i.e. has a state > 0), or -1.
18 */
19int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
20{
21 int n = 0;
22 int i;
23 char r;
24
25 /* count */
26 for (i = 0; i < m->m_max_mds; i++)
27 if (m->m_info[i].state > 0)
28 n++;
29 if (n == 0)
30 return -1;
31
32 /* pick */
33 get_random_bytes(&r, 1);
34 n = r % n;
35 i = 0;
36 for (i = 0; n > 0; i++, n--)
37 while (m->m_info[i].state <= 0)
38 i++;
39
40 return i;
41}
42
43/*
44 * Decode an MDS map
45 *
46 * Ignore any fields we don't care about (there are quite a few of
47 * them).
48 */
49struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
50{
51 struct ceph_mdsmap *m;
52 const void *start = *p;
53 int i, j, n;
54 int err = -EINVAL;
55 u16 version;
56
57 m = kzalloc(sizeof(*m), GFP_NOFS);
58 if (m == NULL)
59 return ERR_PTR(-ENOMEM);
60
61 ceph_decode_16_safe(p, end, version, bad);
62
63 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
64 m->m_epoch = ceph_decode_32(p);
65 m->m_client_epoch = ceph_decode_32(p);
66 m->m_last_failure = ceph_decode_32(p);
67 m->m_root = ceph_decode_32(p);
68 m->m_session_timeout = ceph_decode_32(p);
69 m->m_session_autoclose = ceph_decode_32(p);
70 m->m_max_file_size = ceph_decode_64(p);
71 m->m_max_mds = ceph_decode_32(p);
72
73 m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
74 if (m->m_info == NULL)
75 goto badmem;
76
77 /* pick out active nodes from mds_info (state > 0) */
78 n = ceph_decode_32(p);
79 for (i = 0; i < n; i++) {
80 u64 global_id;
81 u32 namelen;
82 s32 mds, inc, state;
83 u64 state_seq;
84 u8 infoversion;
85 struct ceph_entity_addr addr;
86 u32 num_export_targets;
87 void *pexport_targets = NULL;
88
89 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
90 global_id = ceph_decode_64(p);
91 infoversion = ceph_decode_8(p);
92 *p += sizeof(u64);
93 namelen = ceph_decode_32(p); /* skip mds name */
94 *p += namelen;
95
96 ceph_decode_need(p, end,
97 4*sizeof(u32) + sizeof(u64) +
98 sizeof(addr) + sizeof(struct ceph_timespec),
99 bad);
100 mds = ceph_decode_32(p);
101 inc = ceph_decode_32(p);
102 state = ceph_decode_32(p);
103 state_seq = ceph_decode_64(p);
104 ceph_decode_copy(p, &addr, sizeof(addr));
105 ceph_decode_addr(&addr);
106 *p += sizeof(struct ceph_timespec);
107 *p += sizeof(u32);
108 ceph_decode_32_safe(p, end, namelen, bad);
109 *p += namelen;
110 if (infoversion >= 2) {
111 ceph_decode_32_safe(p, end, num_export_targets, bad);
112 pexport_targets = *p;
113 *p += num_export_targets * sizeof(u32);
114 } else {
115 num_export_targets = 0;
116 }
117
118 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
119 i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
120 ceph_mds_state_name(state));
121 if (mds >= 0 && mds < m->m_max_mds && state > 0) {
122 m->m_info[mds].global_id = global_id;
123 m->m_info[mds].state = state;
124 m->m_info[mds].addr = addr;
125 m->m_info[mds].num_export_targets = num_export_targets;
126 if (num_export_targets) {
127 m->m_info[mds].export_targets =
128 kcalloc(num_export_targets, sizeof(u32),
129 GFP_NOFS);
130 for (j = 0; j < num_export_targets; j++)
131 m->m_info[mds].export_targets[j] =
132 ceph_decode_32(&pexport_targets);
133 } else {
134 m->m_info[mds].export_targets = NULL;
135 }
136 }
137 }
138
139 /* pg_pools */
140 ceph_decode_32_safe(p, end, n, bad);
141 m->m_num_data_pg_pools = n;
142 m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
143 if (!m->m_data_pg_pools)
144 goto badmem;
145 ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
146 for (i = 0; i < n; i++)
147 m->m_data_pg_pools[i] = ceph_decode_32(p);
148 m->m_cas_pg_pool = ceph_decode_32(p);
149
150 /* ok, we don't care about the rest. */
151 dout("mdsmap_decode success epoch %u\n", m->m_epoch);
152 return m;
153
154badmem:
155 err = -ENOMEM;
156bad:
157 pr_err("corrupt mdsmap\n");
158 print_hex_dump(KERN_DEBUG, "mdsmap: ",
159 DUMP_PREFIX_OFFSET, 16, 1,
160 start, end - start, true);
161 ceph_mdsmap_destroy(m);
162 return ERR_PTR(-EINVAL);
163}
164
165void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
166{
167 int i;
168
169 for (i = 0; i < m->m_max_mds; i++)
170 kfree(m->m_info[i].export_targets);
171 kfree(m->m_info);
172 kfree(m->m_data_pg_pools);
173 kfree(m);
174}
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
new file mode 100644
index 000000000000..eacc131aa5cb
--- /dev/null
+++ b/fs/ceph/mdsmap.h
@@ -0,0 +1,54 @@
1#ifndef _FS_CEPH_MDSMAP_H
2#define _FS_CEPH_MDSMAP_H
3
4#include "types.h"
5
6/*
7 * mds map - describe servers in the mds cluster.
8 *
9 * we limit fields to those the client actually cares about
10 */
/* per-mds entry of the mds map, indexed by mds number in ceph_mdsmap.m_info */
11struct ceph_mds_info {
12 u64 global_id;
13 struct ceph_entity_addr addr;
14 s32 state; /* > 0 means the mds is "up" */
15 int num_export_targets; /* number of entries in export_targets */
16 u32 *export_targets; /* allocated array, or NULL when there are none */
17};
18
/* decoded mds map; built by ceph_mdsmap_decode(), freed by ceph_mdsmap_destroy() */
19struct ceph_mdsmap {
20 u32 m_epoch, m_client_epoch, m_last_failure;
21 u32 m_root;
22 u32 m_session_timeout; /* seconds */
23 u32 m_session_autoclose; /* seconds */
24 u64 m_max_file_size;
25 u32 m_max_mds; /* number of entries in the m_info array */
26 struct ceph_mds_info *m_info; /* indexed by mds number */
27
28 /* which object pools file data can be stored in */
29 int m_num_data_pg_pools; /* number of entries in m_data_pg_pools */
30 u32 *m_data_pg_pools;
31 u32 m_cas_pg_pool;
32};
33
34static inline struct ceph_entity_addr *
35ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
36{
37 if (w >= m->m_max_mds)
38 return NULL;
39 return &m->m_info[w].addr;
40}
41
42static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
43{
44 BUG_ON(w < 0);
45 if (w >= m->m_max_mds)
46 return CEPH_MDS_STATE_DNE;
47 return m->m_info[w].state;
48}
49
50extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
51extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
52extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
53
54#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
new file mode 100644
index 000000000000..cdaaa131add3
--- /dev/null
+++ b/fs/ceph/messenger.c
@@ -0,0 +1,2249 @@
1#include "ceph_debug.h"
2
3#include <linux/crc32c.h>
4#include <linux/ctype.h>
5#include <linux/highmem.h>
6#include <linux/inet.h>
7#include <linux/kthread.h>
8#include <linux/net.h>
9#include <linux/slab.h>
10#include <linux/socket.h>
11#include <linux/string.h>
12#include <net/tcp.h>
13
14#include "super.h"
15#include "messenger.h"
16#include "decode.h"
17#include "pagelist.h"
18
19/*
20 * Ceph uses the messenger to exchange ceph_msg messages with other
21 * hosts in the system. The messenger provides ordered and reliable
22 * delivery. We tolerate TCP disconnects by reconnecting (with
23 * exponential backoff) in the case of a fault (disconnection, bad
24 * crc, protocol error). Acks allow sent messages to be discarded by
25 * the sender.
26 */
27
28/* static tag bytes (protocol control messages) */
29static char tag_msg = CEPH_MSGR_TAG_MSG;
30static char tag_ack = CEPH_MSGR_TAG_ACK;
31static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
32
33#ifdef CONFIG_LOCKDEP
34static struct lock_class_key socket_class;
35#endif
36
37
38static void queue_con(struct ceph_connection *con);
39static void con_work(struct work_struct *);
40static void ceph_fault(struct ceph_connection *con);
41
42const char *ceph_name_type_str(int t)
43{
44 switch (t) {
45 case CEPH_ENTITY_TYPE_MON: return "mon";
46 case CEPH_ENTITY_TYPE_MDS: return "mds";
47 case CEPH_ENTITY_TYPE_OSD: return "osd";
48 case CEPH_ENTITY_TYPE_CLIENT: return "client";
49 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
50 default: return "???";
51 }
52}
53
54/*
55 * nicely render a sockaddr as a string.
56 */
57#define MAX_ADDR_STR 20
58static char addr_str[MAX_ADDR_STR][40];
59static DEFINE_SPINLOCK(addr_str_lock);
60static int last_addr_str;
61
62const char *pr_addr(const struct sockaddr_storage *ss)
63{
64 int i;
65 char *s;
66 struct sockaddr_in *in4 = (void *)ss;
67 unsigned char *quad = (void *)&in4->sin_addr.s_addr;
68 struct sockaddr_in6 *in6 = (void *)ss;
69
70 spin_lock(&addr_str_lock);
71 i = last_addr_str++;
72 if (last_addr_str == MAX_ADDR_STR)
73 last_addr_str = 0;
74 spin_unlock(&addr_str_lock);
75 s = addr_str[i];
76
77 switch (ss->ss_family) {
78 case AF_INET:
79 sprintf(s, "%u.%u.%u.%u:%u",
80 (unsigned int)quad[0],
81 (unsigned int)quad[1],
82 (unsigned int)quad[2],
83 (unsigned int)quad[3],
84 (unsigned int)ntohs(in4->sin_port));
85 break;
86
87 case AF_INET6:
88 sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u",
89 in6->sin6_addr.s6_addr16[0],
90 in6->sin6_addr.s6_addr16[1],
91 in6->sin6_addr.s6_addr16[2],
92 in6->sin6_addr.s6_addr16[3],
93 in6->sin6_addr.s6_addr16[4],
94 in6->sin6_addr.s6_addr16[5],
95 in6->sin6_addr.s6_addr16[6],
96 in6->sin6_addr.s6_addr16[7],
97 (unsigned int)ntohs(in6->sin6_port));
98 break;
99
100 default:
101 sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
102 }
103
104 return s;
105}
106
/*
 * Refresh msgr->my_enc_addr: copy msgr->inst.addr into it and run
 * ceph_encode_addr() on the copy.  my_enc_addr is what
 * prepare_write_banner() puts on the wire.
 */
107static void encode_my_addr(struct ceph_messenger *msgr)
108{
109 memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
110 ceph_encode_addr(&msgr->my_enc_addr);
111}
112
113/*
114 * work queue for all reading and writing to/from the socket.
115 */
116struct workqueue_struct *ceph_msgr_wq;
117
118int __init ceph_msgr_init(void)
119{
120 ceph_msgr_wq = create_workqueue("ceph-msgr");
121 if (IS_ERR(ceph_msgr_wq)) {
122 int ret = PTR_ERR(ceph_msgr_wq);
123 pr_err("msgr_init failed to create workqueue: %d\n", ret);
124 ceph_msgr_wq = NULL;
125 return ret;
126 }
127 return 0;
128}
129
/* Tear down the workqueue created by ceph_msgr_init(). */
130void ceph_msgr_exit(void)
131{
132 destroy_workqueue(ceph_msgr_wq);
133}
134
135/*
136 * socket callback functions
137 */
138
139/* data available on socket, or listen socket received a connect */
140static void ceph_data_ready(struct sock *sk, int count_unused)
141{
142 struct ceph_connection *con =
143 (struct ceph_connection *)sk->sk_user_data;
144 if (sk->sk_state != TCP_CLOSE_WAIT) {
145 dout("ceph_data_ready on %p state = %lu, queueing work\n",
146 con, con->state);
147 queue_con(con);
148 }
149}
150
151/* socket has buffer space for writing */
152static void ceph_write_space(struct sock *sk)
153{
154 struct ceph_connection *con =
155 (struct ceph_connection *)sk->sk_user_data;
156
157 /* only queue to workqueue if there is data we want to write. */
158 if (test_bit(WRITE_PENDING, &con->state)) {
159 dout("ceph_write_space %p queueing write work\n", con);
160 queue_con(con);
161 } else {
162 dout("ceph_write_space %p nothing to write\n", con);
163 }
164
165 /* since we have our own write_space, clear the SOCK_NOSPACE flag */
166 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
167}
168
169/* socket's state has changed */
170static void ceph_state_change(struct sock *sk)
171{
172 struct ceph_connection *con =
173 (struct ceph_connection *)sk->sk_user_data;
174
175 dout("ceph_state_change %p state = %lu sk_state = %u\n",
176 con, con->state, sk->sk_state);
177
178 if (test_bit(CLOSED, &con->state))
179 return;
180
181 switch (sk->sk_state) {
182 case TCP_CLOSE:
183 dout("ceph_state_change TCP_CLOSE\n");
184 case TCP_CLOSE_WAIT:
185 dout("ceph_state_change TCP_CLOSE_WAIT\n");
186 if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
187 if (test_bit(CONNECTING, &con->state))
188 con->error_msg = "connection failed";
189 else
190 con->error_msg = "socket closed";
191 queue_con(con);
192 }
193 break;
194 case TCP_ESTABLISHED:
195 dout("ceph_state_change TCP_ESTABLISHED\n");
196 queue_con(con);
197 break;
198 }
199}
200
201/*
202 * set up socket callbacks
203 */
204static void set_sock_callbacks(struct socket *sock,
205 struct ceph_connection *con)
206{
207 struct sock *sk = sock->sk;
208 sk->sk_user_data = (void *)con;
209 sk->sk_data_ready = ceph_data_ready;
210 sk->sk_write_space = ceph_write_space;
211 sk->sk_state_change = ceph_state_change;
212}
213
214
215/*
216 * socket helpers
217 */
218
219/*
220 * initiate connection to a remote socket.
221 */
222static struct socket *ceph_tcp_connect(struct ceph_connection *con)
223{
224 struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr;
225 struct socket *sock;
226 int ret;
227
228 BUG_ON(con->sock);
229 ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
230 if (ret)
231 return ERR_PTR(ret);
232 con->sock = sock;
233 sock->sk->sk_allocation = GFP_NOFS;
234
235#ifdef CONFIG_LOCKDEP
236 lockdep_set_class(&sock->sk->sk_lock, &socket_class);
237#endif
238
239 set_sock_callbacks(sock, con);
240
241 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
242
243 ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK);
244 if (ret == -EINPROGRESS) {
245 dout("connect %s EINPROGRESS sk_state = %u\n",
246 pr_addr(&con->peer_addr.in_addr),
247 sock->sk->sk_state);
248 ret = 0;
249 }
250 if (ret < 0) {
251 pr_err("connect %s error %d\n",
252 pr_addr(&con->peer_addr.in_addr), ret);
253 sock_release(sock);
254 con->sock = NULL;
255 con->error_msg = "connect error";
256 }
257
258 if (ret < 0)
259 return ERR_PTR(ret);
260 return sock;
261}
262
263static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
264{
265 struct kvec iov = {buf, len};
266 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
267
268 return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
269}
270
271/*
272 * write something. @more is true if caller will be sending more data
273 * shortly.
274 */
275static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
276 size_t kvlen, size_t len, int more)
277{
278 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
279
280 if (more)
281 msg.msg_flags |= MSG_MORE;
282 else
283 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
284
285 return kernel_sendmsg(sock, &msg, iov, kvlen, len);
286}
287
288
289/*
290 * Shutdown/close the socket for the given connection.
291 */
292static int con_close_socket(struct ceph_connection *con)
293{
294 int rc;
295
296 dout("con_close_socket on %p sock %p\n", con, con->sock);
297 if (!con->sock)
298 return 0;
299 set_bit(SOCK_CLOSED, &con->state);
300 rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
301 sock_release(con->sock);
302 con->sock = NULL;
303 clear_bit(SOCK_CLOSED, &con->state);
304 return rc;
305}
306
307/*
308 * Reset a connection. Discard all incoming and outgoing messages
309 * and clear *_seq state.
310 */
/* Unlink @msg from whatever list it is on and drop one reference to it. */
311static void ceph_msg_remove(struct ceph_msg *msg)
312{
313 list_del_init(&msg->list_head);
314 ceph_msg_put(msg);
315}
316static void ceph_msg_remove_list(struct list_head *head)
317{
318 while (!list_empty(head)) {
319 struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
320 list_head);
321 ceph_msg_remove(msg);
322 }
323}
324
325static void reset_connection(struct ceph_connection *con)
326{
327 /* reset connection, out_queue, msg_ and connect_seq */
328 /* discard existing out_queue and msg_seq */
329 ceph_msg_remove_list(&con->out_queue);
330 ceph_msg_remove_list(&con->out_sent);
331
332 if (con->in_msg) {
333 ceph_msg_put(con->in_msg);
334 con->in_msg = NULL;
335 }
336
337 con->connect_seq = 0;
338 con->out_seq = 0;
339 if (con->out_msg) {
340 ceph_msg_put(con->out_msg);
341 con->out_msg = NULL;
342 }
343 con->in_seq = 0;
344 con->in_seq_acked = 0;
345}
346
347/*
348 * mark a peer down. drop any open connections.
349 */
350void ceph_con_close(struct ceph_connection *con)
351{
352 dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
353 set_bit(CLOSED, &con->state); /* in case there's queued work */
354 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
355 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
356 clear_bit(KEEPALIVE_PENDING, &con->state);
357 clear_bit(WRITE_PENDING, &con->state);
358 mutex_lock(&con->mutex); /* message queues and seq state are reset under con->mutex */
359 reset_connection(con);
360 cancel_delayed_work(&con->work);
361 mutex_unlock(&con->mutex);
362 queue_con(con); /* queue work again now that CLOSED is set */
363}
364
365/*
366 * Reopen a closed connection, with a new peer address.
367 */
368void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
369{
370 dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
371 set_bit(OPENING, &con->state); /* set before CLOSED is cleared */
372 clear_bit(CLOSED, &con->state);
373 memcpy(&con->peer_addr, addr, sizeof(*addr)); /* remember the new peer address */
374 con->delay = 0; /* reset backoff memory */
375 queue_con(con); /* queue connection work */
376}
377
378/*
379 * return true if this connection ever successfully opened
380 */
381bool ceph_con_opened(struct ceph_connection *con)
382{
383 return con->connect_seq > 0; /* reset_connection() zeroes connect_seq */
384}
385
386/*
387 * generic get/put
388 */
389struct ceph_connection *ceph_con_get(struct ceph_connection *con)
390{
391 dout("con_get %p nref = %d -> %d\n", con,
392 atomic_read(&con->nref), atomic_read(&con->nref) + 1);
393 if (atomic_inc_not_zero(&con->nref))
394 return con;
395 return NULL;
396}
397
398void ceph_con_put(struct ceph_connection *con)
399{
400 dout("con_put %p nref = %d -> %d\n", con,
401 atomic_read(&con->nref), atomic_read(&con->nref) - 1);
402 BUG_ON(atomic_read(&con->nref) == 0);
403 if (atomic_dec_and_test(&con->nref)) {
404 BUG_ON(con->sock);
405 kfree(con);
406 }
407}
408
409/*
410 * initialize a new connection.
411 */
412void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
413{
414 dout("con_init %p\n", con);
415 memset(con, 0, sizeof(*con));
416 atomic_set(&con->nref, 1);
417 con->msgr = msgr;
418 mutex_init(&con->mutex);
419 INIT_LIST_HEAD(&con->out_queue);
420 INIT_LIST_HEAD(&con->out_sent);
421 INIT_DELAYED_WORK(&con->work, con_work);
422}
423
424
425/*
426 * We maintain a global counter to order connection attempts. Get
427 * a unique seq greater than @gt.
428 */
429static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
430{
431 u32 ret;
432
433 spin_lock(&msgr->global_seq_lock);
434 if (msgr->global_seq < gt)
435 msgr->global_seq = gt;
436 ret = ++msgr->global_seq;
437 spin_unlock(&msgr->global_seq_lock);
438 return ret;
439}
440
441
442/*
443 * Prepare footer for currently outgoing message, and finish things
444 * off. Assumes out_kvec* are already valid.. we just add on to the end.
445 */
446static void prepare_write_message_footer(struct ceph_connection *con, int v)
447{
448 struct ceph_msg *m = con->out_msg;
449
450 dout("prepare_write_message_footer %p\n", con);
451 con->out_kvec_is_msg = true;
452 con->out_kvec[v].iov_base = &m->footer;
453 con->out_kvec[v].iov_len = sizeof(m->footer);
454 con->out_kvec_bytes += sizeof(m->footer);
455 con->out_kvec_left++;
456 con->out_more = m->more_to_follow;
457 con->out_msg_done = true;
458}
459
460/*
461 * Prepare headers for the next outgoing message.
462 */
463static void prepare_write_message(struct ceph_connection *con)
464{
465 struct ceph_msg *m;
466 int v = 0;
467
468 con->out_kvec_bytes = 0;
469 con->out_kvec_is_msg = true;
470 con->out_msg_done = false;
471
472 /* Sneak an ack in there first? If we can get it into the same
473 * TCP packet that's a good thing. */
474 if (con->in_seq > con->in_seq_acked) {
475 con->in_seq_acked = con->in_seq;
476 con->out_kvec[v].iov_base = &tag_ack;
477 con->out_kvec[v++].iov_len = 1;
478 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
479 con->out_kvec[v].iov_base = &con->out_temp_ack;
480 con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
481 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
482 }
483
484 m = list_first_entry(&con->out_queue,
485 struct ceph_msg, list_head);
486 con->out_msg = m;
487 if (test_bit(LOSSYTX, &con->state)) {
488 list_del_init(&m->list_head);
489 } else {
490 /* put message on sent list */
491 ceph_msg_get(m);
492 list_move_tail(&m->list_head, &con->out_sent);
493 }
494
495 m->hdr.seq = cpu_to_le64(++con->out_seq);
496
497 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
498 m, con->out_seq, le16_to_cpu(m->hdr.type),
499 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
500 le32_to_cpu(m->hdr.data_len),
501 m->nr_pages);
502 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
503
504 /* tag + hdr + front + middle */
505 con->out_kvec[v].iov_base = &tag_msg;
506 con->out_kvec[v++].iov_len = 1;
507 con->out_kvec[v].iov_base = &m->hdr;
508 con->out_kvec[v++].iov_len = sizeof(m->hdr);
509 con->out_kvec[v++] = m->front;
510 if (m->middle)
511 con->out_kvec[v++] = m->middle->vec;
512 con->out_kvec_left = v;
513 con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
514 (m->middle ? m->middle->vec.iov_len : 0);
515 con->out_kvec_cur = con->out_kvec;
516
517 /* fill in crc (except data pages), footer */
518 con->out_msg->hdr.crc =
519 cpu_to_le32(crc32c(0, (void *)&m->hdr,
520 sizeof(m->hdr) - sizeof(m->hdr.crc)));
521 con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
522 con->out_msg->footer.front_crc =
523 cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
524 if (m->middle)
525 con->out_msg->footer.middle_crc =
526 cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
527 m->middle->vec.iov_len));
528 else
529 con->out_msg->footer.middle_crc = 0;
530 con->out_msg->footer.data_crc = 0;
531 dout("prepare_write_message front_crc %u data_crc %u\n",
532 le32_to_cpu(con->out_msg->footer.front_crc),
533 le32_to_cpu(con->out_msg->footer.middle_crc));
534
535 /* is there a data payload? */
536 if (le32_to_cpu(m->hdr.data_len) > 0) {
537 /* initialize page iterator */
538 con->out_msg_pos.page = 0;
539 con->out_msg_pos.page_pos =
540 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
541 con->out_msg_pos.data_pos = 0;
542 con->out_msg_pos.did_page_crc = 0;
543 con->out_more = 1; /* data + footer will follow */
544 } else {
545 /* no, queue up footer too and be done */
546 prepare_write_message_footer(con, v);
547 }
548
549 set_bit(WRITE_PENDING, &con->state);
550}
551
552/*
553 * Prepare an ack.
554 */
555static void prepare_write_ack(struct ceph_connection *con)
556{
557 dout("prepare_write_ack %p %llu -> %llu\n", con,
558 con->in_seq_acked, con->in_seq);
559 con->in_seq_acked = con->in_seq;
560
561 con->out_kvec[0].iov_base = &tag_ack;
562 con->out_kvec[0].iov_len = 1;
563 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
564 con->out_kvec[1].iov_base = &con->out_temp_ack;
565 con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
566 con->out_kvec_left = 2;
567 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
568 con->out_kvec_cur = con->out_kvec;
569 con->out_more = 1; /* more will follow.. eventually.. */
570 set_bit(WRITE_PENDING, &con->state);
571}
572
573/*
574 * Prepare to write keepalive byte.
575 */
576static void prepare_write_keepalive(struct ceph_connection *con)
577{
578 dout("prepare_write_keepalive %p\n", con);
579 con->out_kvec[0].iov_base = &tag_keepalive;
580 con->out_kvec[0].iov_len = 1;
581 con->out_kvec_left = 1;
582 con->out_kvec_bytes = 1;
583 con->out_kvec_cur = con->out_kvec;
584 set_bit(WRITE_PENDING, &con->state);
585}
586
587/*
588 * Connection negotiation.
589 */
590
591static void prepare_connect_authorizer(struct ceph_connection *con)
592{
593 void *auth_buf;
594 int auth_len = 0;
595 int auth_protocol = 0;
596
597 mutex_unlock(&con->mutex);
598 if (con->ops->get_authorizer)
599 con->ops->get_authorizer(con, &auth_buf, &auth_len,
600 &auth_protocol, &con->auth_reply_buf,
601 &con->auth_reply_buf_len,
602 con->auth_retry);
603 mutex_lock(&con->mutex);
604
605 con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
606 con->out_connect.authorizer_len = cpu_to_le32(auth_len);
607
608 con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
609 con->out_kvec[con->out_kvec_left].iov_len = auth_len;
610 con->out_kvec_left++;
611 con->out_kvec_bytes += auth_len;
612}
613
614/*
615 * We connected to a peer and are saying hello.
616 */
617static void prepare_write_banner(struct ceph_messenger *msgr,
618 struct ceph_connection *con)
619{
620 int len = strlen(CEPH_BANNER);
621
622 con->out_kvec[0].iov_base = CEPH_BANNER;
623 con->out_kvec[0].iov_len = len;
624 con->out_kvec[1].iov_base = &msgr->my_enc_addr;
625 con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
626 con->out_kvec_left = 2;
627 con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
628 con->out_kvec_cur = con->out_kvec;
629 con->out_more = 0;
630 set_bit(WRITE_PENDING, &con->state);
631}
632
633static void prepare_write_connect(struct ceph_messenger *msgr,
634 struct ceph_connection *con,
635 int after_banner)
636{
637 unsigned global_seq = get_global_seq(con->msgr, 0);
638 int proto;
639
640 switch (con->peer_name.type) {
641 case CEPH_ENTITY_TYPE_MON:
642 proto = CEPH_MONC_PROTOCOL;
643 break;
644 case CEPH_ENTITY_TYPE_OSD:
645 proto = CEPH_OSDC_PROTOCOL;
646 break;
647 case CEPH_ENTITY_TYPE_MDS:
648 proto = CEPH_MDSC_PROTOCOL;
649 break;
650 default:
651 BUG();
652 }
653
654 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
655 con->connect_seq, global_seq, proto);
656
657 con->out_connect.features = CEPH_FEATURE_SUPPORTED;
658 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
659 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
660 con->out_connect.global_seq = cpu_to_le32(global_seq);
661 con->out_connect.protocol_version = cpu_to_le32(proto);
662 con->out_connect.flags = 0;
663
664 if (!after_banner) {
665 con->out_kvec_left = 0;
666 con->out_kvec_bytes = 0;
667 }
668 con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
669 con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
670 con->out_kvec_left++;
671 con->out_kvec_bytes += sizeof(con->out_connect);
672 con->out_kvec_cur = con->out_kvec;
673 con->out_more = 0;
674 set_bit(WRITE_PENDING, &con->state);
675
676 prepare_connect_authorizer(con);
677}
678
679
680/*
681 * write as much of pending kvecs to the socket as we can.
682 * 1 -> done
683 * 0 -> socket full, but more to do
684 * <0 -> error
685 */
686static int write_partial_kvec(struct ceph_connection *con)
687{
688 int ret;
689
690 dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
691 while (con->out_kvec_bytes > 0) {
692 ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
693 con->out_kvec_left, con->out_kvec_bytes,
694 con->out_more);
695 if (ret <= 0)
696 goto out;
697 con->out_kvec_bytes -= ret;
698 if (con->out_kvec_bytes == 0)
699 break; /* done */
700 while (ret > 0) {
701 if (ret >= con->out_kvec_cur->iov_len) {
702 ret -= con->out_kvec_cur->iov_len;
703 con->out_kvec_cur++;
704 con->out_kvec_left--;
705 } else {
706 con->out_kvec_cur->iov_len -= ret;
707 con->out_kvec_cur->iov_base += ret;
708 ret = 0;
709 break;
710 }
711 }
712 }
713 con->out_kvec_left = 0;
714 con->out_kvec_is_msg = false;
715 ret = 1;
716out:
717 dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
718 con->out_kvec_bytes, con->out_kvec_left, ret);
719 return ret; /* done! */
720}
721
722/*
723 * Write as much message data payload as we can. If we finish, queue
724 * up the footer.
725 * 1 -> done, footer is now queued in out_kvec[].
726 * 0 -> socket full, but more to do
727 * <0 -> error
728 */
729static int write_partial_msg_pages(struct ceph_connection *con)
730{
731 struct ceph_msg *msg = con->out_msg;
732 unsigned data_len = le32_to_cpu(msg->hdr.data_len);
733 size_t len;
734 int crc = con->msgr->nocrc;
735 int ret;
736
737 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
738 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
739 con->out_msg_pos.page_pos);
740
741 while (con->out_msg_pos.page < con->out_msg->nr_pages) {
742 struct page *page = NULL;
743 void *kaddr = NULL;
744
745 /*
746 * if we are calculating the data crc (the default), we need
747 * to map the page. if our pages[] has been revoked, use the
748 * zero page.
749 */
750 if (msg->pages) {
751 page = msg->pages[con->out_msg_pos.page];
752 if (crc)
753 kaddr = kmap(page);
754 } else if (msg->pagelist) {
755 page = list_first_entry(&msg->pagelist->head,
756 struct page, lru);
757 if (crc)
758 kaddr = kmap(page);
759 } else {
760 page = con->msgr->zero_page;
761 if (crc)
762 kaddr = page_address(con->msgr->zero_page);
763 }
764 len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
765 (int)(data_len - con->out_msg_pos.data_pos));
766 if (crc && !con->out_msg_pos.did_page_crc) {
767 void *base = kaddr + con->out_msg_pos.page_pos;
768 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
769
770 BUG_ON(kaddr == NULL);
771 con->out_msg->footer.data_crc =
772 cpu_to_le32(crc32c(tmpcrc, base, len));
773 con->out_msg_pos.did_page_crc = 1;
774 }
775
776 ret = kernel_sendpage(con->sock, page,
777 con->out_msg_pos.page_pos, len,
778 MSG_DONTWAIT | MSG_NOSIGNAL |
779 MSG_MORE);
780
781 if (crc && (msg->pages || msg->pagelist))
782 kunmap(page);
783
784 if (ret <= 0)
785 goto out;
786
787 con->out_msg_pos.data_pos += ret;
788 con->out_msg_pos.page_pos += ret;
789 if (ret == len) {
790 con->out_msg_pos.page_pos = 0;
791 con->out_msg_pos.page++;
792 con->out_msg_pos.did_page_crc = 0;
793 if (msg->pagelist)
794 list_move_tail(&page->lru,
795 &msg->pagelist->head);
796 }
797 }
798
799 dout("write_partial_msg_pages %p msg %p done\n", con, msg);
800
801 /* prepare and queue up footer, too */
802 if (!crc)
803 con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
804 con->out_kvec_bytes = 0;
805 con->out_kvec_left = 0;
806 con->out_kvec_cur = con->out_kvec;
807 prepare_write_message_footer(con, 0);
808 ret = 1;
809out:
810 return ret;
811}
812
813/*
814 * write some zeros
815 */
816static int write_partial_skip(struct ceph_connection *con)
817{
818 int ret;
819
820 while (con->out_skip > 0) {
821 struct kvec iov = {
822 .iov_base = page_address(con->msgr->zero_page),
823 .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
824 };
825
826 ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
827 if (ret <= 0)
828 goto out;
829 con->out_skip -= ret;
830 }
831 ret = 1;
832out:
833 return ret;
834}
835
836/*
837 * Prepare to read connection handshake, or an ack.
838 */
839static void prepare_read_banner(struct ceph_connection *con)
840{
841 dout("prepare_read_banner %p\n", con);
842 con->in_base_pos = 0; /* read_partial() counts received bytes from here */
843}
844
/* Get ready to read the peer's connect reply. */
845static void prepare_read_connect(struct ceph_connection *con)
846{
847 dout("prepare_read_connect %p\n", con);
848 con->in_base_pos = 0; /* read_partial() counts received bytes from here */
849}
850
/* Get ready to read an ack. */
851static void prepare_read_ack(struct ceph_connection *con)
852{
853 dout("prepare_read_ack %p\n", con);
854 con->in_base_pos = 0; /* read_partial() counts received bytes from here */
855}
856
/* Get ready to read the next tag byte. */
857static void prepare_read_tag(struct ceph_connection *con)
858{
859 dout("prepare_read_tag %p\n", con);
860 con->in_base_pos = 0; /* read_partial() counts received bytes from here */
861 con->in_tag = CEPH_MSGR_TAG_READY;
862}
863
864/*
865 * Prepare to read a message.
866 */
867static int prepare_read_message(struct ceph_connection *con)
868{
869 dout("prepare_read_message %p\n", con);
870 BUG_ON(con->in_msg != NULL); /* no previous incoming message may still be attached */
871 con->in_base_pos = 0;
872 con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; /* clear the running crcs */
873 return 0; /* always succeeds */
874}
875
876
877static int read_partial(struct ceph_connection *con,
878 int *to, int size, void *object)
879{
880 *to += size;
881 while (con->in_base_pos < *to) {
882 int left = *to - con->in_base_pos;
883 int have = size - left;
884 int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
885 if (ret <= 0)
886 return ret;
887 con->in_base_pos += ret;
888 }
889 return 1;
890}
891
892
893/*
894 * Read all or part of the connect-side handshake on a new connection
895 */
896static int read_partial_banner(struct ceph_connection *con)
897{
898 int ret, to = 0;
899
900 dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
901
902 /* peer's banner */
903 ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
904 if (ret <= 0)
905 goto out;
906 ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
907 &con->actual_peer_addr);
908 if (ret <= 0)
909 goto out;
910 ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
911 &con->peer_addr_for_me);
912 if (ret <= 0)
913 goto out;
914out:
915 return ret;
916}
917
918static int read_partial_connect(struct ceph_connection *con)
919{
920 int ret, to = 0;
921
922 dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
923
924 ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
925 if (ret <= 0)
926 goto out;
927 ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
928 con->auth_reply_buf);
929 if (ret <= 0)
930 goto out;
931
932 dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
933 con, (int)con->in_reply.tag,
934 le32_to_cpu(con->in_reply.connect_seq),
935 le32_to_cpu(con->in_reply.global_seq));
936out:
937 return ret;
938
939}
940
941/*
942 * Verify the hello banner looks okay.
943 */
944static int verify_hello(struct ceph_connection *con)
945{
946 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
947 pr_err("connect to %s got bad banner\n",
948 pr_addr(&con->peer_addr.in_addr));
949 con->error_msg = "protocol error, bad banner";
950 return -1;
951 }
952 return 0;
953}
954
/*
 * Return true if @ss holds an all-zero IPv4 or IPv6 address.  Any
 * other address family is reported as not blank.
 */
static bool addr_is_blank(struct sockaddr_storage *ss)
{
	if (ss->ss_family == AF_INET) {
		const struct sockaddr_in *v4 = (const struct sockaddr_in *)ss;

		return v4->sin_addr.s_addr == 0;
	}

	if (ss->ss_family == AF_INET6) {
		const struct sockaddr_in6 *v6 = (const struct sockaddr_in6 *)ss;
		unsigned int i;

		/* blank iff every byte of the 128-bit address is zero */
		for (i = 0; i < sizeof(v6->sin6_addr.s6_addr); i++)
			if (v6->sin6_addr.s6_addr[i])
				return false;
		return true;
	}

	return false;
}
969
/*
 * Return the port stored in @ss in host byte order, or 0 if the
 * address family is neither AF_INET nor AF_INET6.
 */
static int addr_port(struct sockaddr_storage *ss)
{
	if (ss->ss_family == AF_INET) {
		struct sockaddr_in *v4 = (struct sockaddr_in *)ss;

		return ntohs(v4->sin_port);
	}
	if (ss->ss_family == AF_INET6) {
		struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)ss;

		return ntohs(v6->sin6_port);
	}
	return 0;
}
980
/*
 * Store port @p (host byte order) into @ss according to its address
 * family.  Families other than AF_INET / AF_INET6 are left untouched.
 */
static void addr_set_port(struct sockaddr_storage *ss, int p)
{
	switch (ss->ss_family) {
	case AF_INET:
		((struct sockaddr_in *)ss)->sin_port = htons(p);
		break;	/* was missing: fell through into the IPv6 store */
	case AF_INET6:
		((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
		break;
	}
}
990
991/*
992 * Parse an ip[:port] list into an addr array. Use the default
993 * monitor port if a port isn't specified.
994 */
995int ceph_parse_ips(const char *c, const char *end,
996 struct ceph_entity_addr *addr,
997 int max_count, int *count)
998{
999 int i;
1000 const char *p = c;
1001
1002 dout("parse_ips on '%.*s'\n", (int)(end-c), c);
1003 for (i = 0; i < max_count; i++) {
1004 const char *ipend;
1005 struct sockaddr_storage *ss = &addr[i].in_addr;
1006 struct sockaddr_in *in4 = (void *)ss;
1007 struct sockaddr_in6 *in6 = (void *)ss;
1008 int port;
1009
1010 memset(ss, 0, sizeof(*ss));
1011 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1012 ',', &ipend)) {
1013 ss->ss_family = AF_INET;
1014 } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1015 ',', &ipend)) {
1016 ss->ss_family = AF_INET6;
1017 } else {
1018 goto bad;
1019 }
1020 p = ipend;
1021
1022 /* port? */
1023 if (p < end && *p == ':') {
1024 port = 0;
1025 p++;
1026 while (p < end && *p >= '0' && *p <= '9') {
1027 port = (port * 10) + (*p - '0');
1028 p++;
1029 }
1030 if (port > 65535 || port == 0)
1031 goto bad;
1032 } else {
1033 port = CEPH_MON_PORT;
1034 }
1035
1036 addr_set_port(ss, port);
1037
1038 dout("parse_ips got %s\n", pr_addr(ss));
1039
1040 if (p == end)
1041 break;
1042 if (*p != ',')
1043 goto bad;
1044 p++;
1045 }
1046
1047 if (p != end)
1048 goto bad;
1049
1050 if (count)
1051 *count = i + 1;
1052 return 0;
1053
1054bad:
1055 pr_err("parse_ips bad ip '%s'\n", c);
1056 return -EINVAL;
1057}
1058
1059static int process_banner(struct ceph_connection *con)
1060{
1061 dout("process_banner on %p\n", con);
1062
1063 if (verify_hello(con) < 0)
1064 return -1;
1065
1066 ceph_decode_addr(&con->actual_peer_addr);
1067 ceph_decode_addr(&con->peer_addr_for_me);
1068
1069 /*
1070 * Make sure the other end is who we wanted. note that the other
1071 * end may not yet know their ip address, so if it's 0.0.0.0, give
1072 * them the benefit of the doubt.
1073 */
1074 if (memcmp(&con->peer_addr, &con->actual_peer_addr,
1075 sizeof(con->peer_addr)) != 0 &&
1076 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1077 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1078 pr_warning("wrong peer, want %s/%lld, got %s/%lld\n",
1079 pr_addr(&con->peer_addr.in_addr),
1080 le64_to_cpu(con->peer_addr.nonce),
1081 pr_addr(&con->actual_peer_addr.in_addr),
1082 le64_to_cpu(con->actual_peer_addr.nonce));
1083 con->error_msg = "wrong peer at address";
1084 return -1;
1085 }
1086
1087 /*
1088 * did we learn our address?
1089 */
1090 if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
1091 int port = addr_port(&con->msgr->inst.addr.in_addr);
1092
1093 memcpy(&con->msgr->inst.addr.in_addr,
1094 &con->peer_addr_for_me.in_addr,
1095 sizeof(con->peer_addr_for_me.in_addr));
1096 addr_set_port(&con->msgr->inst.addr.in_addr, port);
1097 encode_my_addr(con->msgr);
1098 dout("process_banner learned my addr is %s\n",
1099 pr_addr(&con->msgr->inst.addr.in_addr));
1100 }
1101
1102 set_bit(NEGOTIATING, &con->state);
1103 prepare_read_connect(con);
1104 return 0;
1105}
1106
1107static void fail_protocol(struct ceph_connection *con)
1108{
1109 reset_connection(con);
1110 set_bit(CLOSED, &con->state); /* in case there's queued work */
1111
1112 mutex_unlock(&con->mutex);
1113 if (con->ops->bad_proto)
1114 con->ops->bad_proto(con);
1115 mutex_lock(&con->mutex);
1116}
1117
1118static int process_connect(struct ceph_connection *con)
1119{
1120 u64 sup_feat = CEPH_FEATURE_SUPPORTED;
1121 u64 req_feat = CEPH_FEATURE_REQUIRED;
1122 u64 server_feat = le64_to_cpu(con->in_reply.features);
1123
1124 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
1125
1126 switch (con->in_reply.tag) {
1127 case CEPH_MSGR_TAG_FEATURES:
1128 pr_err("%s%lld %s feature set mismatch,"
1129 " my %llx < server's %llx, missing %llx\n",
1130 ENTITY_NAME(con->peer_name),
1131 pr_addr(&con->peer_addr.in_addr),
1132 sup_feat, server_feat, server_feat & ~sup_feat);
1133 con->error_msg = "missing required protocol features";
1134 fail_protocol(con);
1135 return -1;
1136
1137 case CEPH_MSGR_TAG_BADPROTOVER:
1138 pr_err("%s%lld %s protocol version mismatch,"
1139 " my %d != server's %d\n",
1140 ENTITY_NAME(con->peer_name),
1141 pr_addr(&con->peer_addr.in_addr),
1142 le32_to_cpu(con->out_connect.protocol_version),
1143 le32_to_cpu(con->in_reply.protocol_version));
1144 con->error_msg = "protocol version mismatch";
1145 fail_protocol(con);
1146 return -1;
1147
1148 case CEPH_MSGR_TAG_BADAUTHORIZER:
1149 con->auth_retry++;
1150 dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
1151 con->auth_retry);
1152 if (con->auth_retry == 2) {
1153 con->error_msg = "connect authorization failure";
1154 reset_connection(con);
1155 set_bit(CLOSED, &con->state);
1156 return -1;
1157 }
1158 con->auth_retry = 1;
1159 prepare_write_connect(con->msgr, con, 0);
1160 prepare_read_connect(con);
1161 break;
1162
1163 case CEPH_MSGR_TAG_RESETSESSION:
1164 /*
1165 * If we connected with a large connect_seq but the peer
1166 * has no record of a session with us (no connection, or
1167 * connect_seq == 0), they will send RESETSESION to indicate
1168 * that they must have reset their session, and may have
1169 * dropped messages.
1170 */
1171 dout("process_connect got RESET peer seq %u\n",
1172 le32_to_cpu(con->in_connect.connect_seq));
1173 pr_err("%s%lld %s connection reset\n",
1174 ENTITY_NAME(con->peer_name),
1175 pr_addr(&con->peer_addr.in_addr));
1176 reset_connection(con);
1177 prepare_write_connect(con->msgr, con, 0);
1178 prepare_read_connect(con);
1179
1180 /* Tell ceph about it. */
1181 mutex_unlock(&con->mutex);
1182 pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
1183 if (con->ops->peer_reset)
1184 con->ops->peer_reset(con);
1185 mutex_lock(&con->mutex);
1186 break;
1187
1188 case CEPH_MSGR_TAG_RETRY_SESSION:
1189 /*
1190 * If we sent a smaller connect_seq than the peer has, try
1191 * again with a larger value.
1192 */
1193 dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
1194 le32_to_cpu(con->out_connect.connect_seq),
1195 le32_to_cpu(con->in_connect.connect_seq));
1196 con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
1197 prepare_write_connect(con->msgr, con, 0);
1198 prepare_read_connect(con);
1199 break;
1200
1201 case CEPH_MSGR_TAG_RETRY_GLOBAL:
1202 /*
1203 * If we sent a smaller global_seq than the peer has, try
1204 * again with a larger value.
1205 */
1206 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
1207 con->peer_global_seq,
1208 le32_to_cpu(con->in_connect.global_seq));
1209 get_global_seq(con->msgr,
1210 le32_to_cpu(con->in_connect.global_seq));
1211 prepare_write_connect(con->msgr, con, 0);
1212 prepare_read_connect(con);
1213 break;
1214
1215 case CEPH_MSGR_TAG_READY:
1216 if (req_feat & ~server_feat) {
1217 pr_err("%s%lld %s protocol feature mismatch,"
1218 " my required %llx > server's %llx, need %llx\n",
1219 ENTITY_NAME(con->peer_name),
1220 pr_addr(&con->peer_addr.in_addr),
1221 req_feat, server_feat, req_feat & ~server_feat);
1222 con->error_msg = "missing required protocol features";
1223 fail_protocol(con);
1224 return -1;
1225 }
1226 clear_bit(CONNECTING, &con->state);
1227 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1228 con->connect_seq++;
1229 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1230 con->peer_global_seq,
1231 le32_to_cpu(con->in_reply.connect_seq),
1232 con->connect_seq);
1233 WARN_ON(con->connect_seq !=
1234 le32_to_cpu(con->in_reply.connect_seq));
1235
1236 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
1237 set_bit(LOSSYTX, &con->state);
1238
1239 prepare_read_tag(con);
1240 break;
1241
1242 case CEPH_MSGR_TAG_WAIT:
1243 /*
1244 * If there is a connection race (we are opening
1245 * connections to each other), one of us may just have
1246 * to WAIT. This shouldn't happen if we are the
1247 * client.
1248 */
1249 pr_err("process_connect peer connecting WAIT\n");
1250
1251 default:
1252 pr_err("connect protocol error, will retry\n");
1253 con->error_msg = "protocol error, garbage tag during connect";
1254 return -1;
1255 }
1256 return 0;
1257}
1258
1259
1260/*
1261 * read (part of) an ack
1262 */
1263static int read_partial_ack(struct ceph_connection *con)
1264{
1265 int to = 0;
1266
1267 return read_partial(con, &to, sizeof(con->in_temp_ack),
1268 &con->in_temp_ack);
1269}
1270
1271
1272/*
1273 * We can finally discard anything that's been acked.
1274 */
1275static void process_ack(struct ceph_connection *con)
1276{
1277 struct ceph_msg *m;
1278 u64 ack = le64_to_cpu(con->in_temp_ack);
1279 u64 seq;
1280
1281 while (!list_empty(&con->out_sent)) {
1282 m = list_first_entry(&con->out_sent, struct ceph_msg,
1283 list_head);
1284 seq = le64_to_cpu(m->hdr.seq);
1285 if (seq > ack)
1286 break;
1287 dout("got ack for seq %llu type %d at %p\n", seq,
1288 le16_to_cpu(m->hdr.type), m);
1289 ceph_msg_remove(m);
1290 }
1291 prepare_read_tag(con);
1292}
1293
1294
1295
1296
1297static int read_partial_message_section(struct ceph_connection *con,
1298 struct kvec *section, unsigned int sec_len,
1299 u32 *crc)
1300{
1301 int left;
1302 int ret;
1303
1304 BUG_ON(!section);
1305
1306 while (section->iov_len < sec_len) {
1307 BUG_ON(section->iov_base == NULL);
1308 left = sec_len - section->iov_len;
1309 ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
1310 section->iov_len, left);
1311 if (ret <= 0)
1312 return ret;
1313 section->iov_len += ret;
1314 if (section->iov_len == sec_len)
1315 *crc = crc32c(0, section->iov_base,
1316 section->iov_len);
1317 }
1318
1319 return 1;
1320}
1321
1322static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1323 struct ceph_msg_header *hdr,
1324 int *skip);
1325/*
1326 * read (part of) a message.
1327 */
1328static int read_partial_message(struct ceph_connection *con)
1329{
1330 struct ceph_msg *m = con->in_msg;
1331 void *p;
1332 int ret;
1333 int to, left;
1334 unsigned front_len, middle_len, data_len, data_off;
1335 int datacrc = con->msgr->nocrc;
1336 int skip;
1337
1338 dout("read_partial_message con %p msg %p\n", con, m);
1339
1340 /* header */
1341 while (con->in_base_pos < sizeof(con->in_hdr)) {
1342 left = sizeof(con->in_hdr) - con->in_base_pos;
1343 ret = ceph_tcp_recvmsg(con->sock,
1344 (char *)&con->in_hdr + con->in_base_pos,
1345 left);
1346 if (ret <= 0)
1347 return ret;
1348 con->in_base_pos += ret;
1349 if (con->in_base_pos == sizeof(con->in_hdr)) {
1350 u32 crc = crc32c(0, (void *)&con->in_hdr,
1351 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
1352 if (crc != le32_to_cpu(con->in_hdr.crc)) {
1353 pr_err("read_partial_message bad hdr "
1354 " crc %u != expected %u\n",
1355 crc, con->in_hdr.crc);
1356 return -EBADMSG;
1357 }
1358 }
1359 }
1360 front_len = le32_to_cpu(con->in_hdr.front_len);
1361 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1362 return -EIO;
1363 middle_len = le32_to_cpu(con->in_hdr.middle_len);
1364 if (middle_len > CEPH_MSG_MAX_DATA_LEN)
1365 return -EIO;
1366 data_len = le32_to_cpu(con->in_hdr.data_len);
1367 if (data_len > CEPH_MSG_MAX_DATA_LEN)
1368 return -EIO;
1369 data_off = le16_to_cpu(con->in_hdr.data_off);
1370
1371 /* allocate message? */
1372 if (!con->in_msg) {
1373 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1374 con->in_hdr.front_len, con->in_hdr.data_len);
1375 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1376 if (skip) {
1377 /* skip this message */
1378 dout("alloc_msg returned NULL, skipping message\n");
1379 con->in_base_pos = -front_len - middle_len - data_len -
1380 sizeof(m->footer);
1381 con->in_tag = CEPH_MSGR_TAG_READY;
1382 return 0;
1383 }
1384 if (IS_ERR(con->in_msg)) {
1385 ret = PTR_ERR(con->in_msg);
1386 con->in_msg = NULL;
1387 con->error_msg =
1388 "error allocating memory for incoming message";
1389 return ret;
1390 }
1391 m = con->in_msg;
1392 m->front.iov_len = 0; /* haven't read it yet */
1393 if (m->middle)
1394 m->middle->vec.iov_len = 0;
1395
1396 con->in_msg_pos.page = 0;
1397 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
1398 con->in_msg_pos.data_pos = 0;
1399 }
1400
1401 /* front */
1402 ret = read_partial_message_section(con, &m->front, front_len,
1403 &con->in_front_crc);
1404 if (ret <= 0)
1405 return ret;
1406
1407 /* middle */
1408 if (m->middle) {
1409 ret = read_partial_message_section(con, &m->middle->vec, middle_len,
1410 &con->in_middle_crc);
1411 if (ret <= 0)
1412 return ret;
1413 }
1414
1415 /* (page) data */
1416 while (con->in_msg_pos.data_pos < data_len) {
1417 left = min((int)(data_len - con->in_msg_pos.data_pos),
1418 (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
1419 BUG_ON(m->pages == NULL);
1420 p = kmap(m->pages[con->in_msg_pos.page]);
1421 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1422 left);
1423 if (ret > 0 && datacrc)
1424 con->in_data_crc =
1425 crc32c(con->in_data_crc,
1426 p + con->in_msg_pos.page_pos, ret);
1427 kunmap(m->pages[con->in_msg_pos.page]);
1428 if (ret <= 0)
1429 return ret;
1430 con->in_msg_pos.data_pos += ret;
1431 con->in_msg_pos.page_pos += ret;
1432 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1433 con->in_msg_pos.page_pos = 0;
1434 con->in_msg_pos.page++;
1435 }
1436 }
1437
1438 /* footer */
1439 to = sizeof(m->hdr) + sizeof(m->footer);
1440 while (con->in_base_pos < to) {
1441 left = to - con->in_base_pos;
1442 ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
1443 (con->in_base_pos - sizeof(m->hdr)),
1444 left);
1445 if (ret <= 0)
1446 return ret;
1447 con->in_base_pos += ret;
1448 }
1449 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
1450 m, front_len, m->footer.front_crc, middle_len,
1451 m->footer.middle_crc, data_len, m->footer.data_crc);
1452
1453 /* crc ok? */
1454 if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
1455 pr_err("read_partial_message %p front crc %u != exp. %u\n",
1456 m, con->in_front_crc, m->footer.front_crc);
1457 return -EBADMSG;
1458 }
1459 if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
1460 pr_err("read_partial_message %p middle crc %u != exp %u\n",
1461 m, con->in_middle_crc, m->footer.middle_crc);
1462 return -EBADMSG;
1463 }
1464 if (datacrc &&
1465 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
1466 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
1467 pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
1468 con->in_data_crc, le32_to_cpu(m->footer.data_crc));
1469 return -EBADMSG;
1470 }
1471
1472 return 1; /* done! */
1473}
1474
1475/*
1476 * Process message. This happens in the worker thread. The callback should
1477 * be careful not to do anything that waits on other incoming messages or it
1478 * may deadlock.
1479 */
1480static void process_message(struct ceph_connection *con)
1481{
1482 struct ceph_msg *msg;
1483
1484 msg = con->in_msg;
1485 con->in_msg = NULL;
1486
1487 /* if first message, set peer_name */
1488 if (con->peer_name.type == 0)
1489 con->peer_name = msg->hdr.src.name;
1490
1491 con->in_seq++;
1492 mutex_unlock(&con->mutex);
1493
1494 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1495 msg, le64_to_cpu(msg->hdr.seq),
1496 ENTITY_NAME(msg->hdr.src.name),
1497 le16_to_cpu(msg->hdr.type),
1498 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1499 le32_to_cpu(msg->hdr.front_len),
1500 le32_to_cpu(msg->hdr.data_len),
1501 con->in_front_crc, con->in_middle_crc, con->in_data_crc);
1502 con->ops->dispatch(con, msg);
1503
1504 mutex_lock(&con->mutex);
1505 prepare_read_tag(con);
1506}
1507
1508
1509/*
1510 * Write something to the socket. Called in a worker thread when the
1511 * socket appears to be writeable and we have something ready to send.
1512 */
1513static int try_write(struct ceph_connection *con)
1514{
1515 struct ceph_messenger *msgr = con->msgr;
1516 int ret = 1;
1517
1518 dout("try_write start %p state %lu nref %d\n", con, con->state,
1519 atomic_read(&con->nref));
1520
1521 mutex_lock(&con->mutex);
1522more:
1523 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1524
1525 /* open the socket first? */
1526 if (con->sock == NULL) {
1527 /*
1528 * if we were STANDBY and are reconnecting _this_
1529 * connection, bump connect_seq now. Always bump
1530 * global_seq.
1531 */
1532 if (test_and_clear_bit(STANDBY, &con->state))
1533 con->connect_seq++;
1534
1535 prepare_write_banner(msgr, con);
1536 prepare_write_connect(msgr, con, 1);
1537 prepare_read_banner(con);
1538 set_bit(CONNECTING, &con->state);
1539 clear_bit(NEGOTIATING, &con->state);
1540
1541 BUG_ON(con->in_msg);
1542 con->in_tag = CEPH_MSGR_TAG_READY;
1543 dout("try_write initiating connect on %p new state %lu\n",
1544 con, con->state);
1545 con->sock = ceph_tcp_connect(con);
1546 if (IS_ERR(con->sock)) {
1547 con->sock = NULL;
1548 con->error_msg = "connect error";
1549 ret = -1;
1550 goto out;
1551 }
1552 }
1553
1554more_kvec:
1555 /* kvec data queued? */
1556 if (con->out_skip) {
1557 ret = write_partial_skip(con);
1558 if (ret <= 0)
1559 goto done;
1560 if (ret < 0) {
1561 dout("try_write write_partial_skip err %d\n", ret);
1562 goto done;
1563 }
1564 }
1565 if (con->out_kvec_left) {
1566 ret = write_partial_kvec(con);
1567 if (ret <= 0)
1568 goto done;
1569 }
1570
1571 /* msg pages? */
1572 if (con->out_msg) {
1573 if (con->out_msg_done) {
1574 ceph_msg_put(con->out_msg);
1575 con->out_msg = NULL; /* we're done with this one */
1576 goto do_next;
1577 }
1578
1579 ret = write_partial_msg_pages(con);
1580 if (ret == 1)
1581 goto more_kvec; /* we need to send the footer, too! */
1582 if (ret == 0)
1583 goto done;
1584 if (ret < 0) {
1585 dout("try_write write_partial_msg_pages err %d\n",
1586 ret);
1587 goto done;
1588 }
1589 }
1590
1591do_next:
1592 if (!test_bit(CONNECTING, &con->state)) {
1593 /* is anything else pending? */
1594 if (!list_empty(&con->out_queue)) {
1595 prepare_write_message(con);
1596 goto more;
1597 }
1598 if (con->in_seq > con->in_seq_acked) {
1599 prepare_write_ack(con);
1600 goto more;
1601 }
1602 if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
1603 prepare_write_keepalive(con);
1604 goto more;
1605 }
1606 }
1607
1608 /* Nothing to do! */
1609 clear_bit(WRITE_PENDING, &con->state);
1610 dout("try_write nothing else to write.\n");
1611done:
1612 ret = 0;
1613out:
1614 mutex_unlock(&con->mutex);
1615 dout("try_write done on %p\n", con);
1616 return ret;
1617}
1618
1619
1620
1621/*
1622 * Read what we can from the socket.
1623 */
1624static int try_read(struct ceph_connection *con)
1625{
1626 struct ceph_messenger *msgr;
1627 int ret = -1;
1628
1629 if (!con->sock)
1630 return 0;
1631
1632 if (test_bit(STANDBY, &con->state))
1633 return 0;
1634
1635 dout("try_read start on %p\n", con);
1636 msgr = con->msgr;
1637
1638 mutex_lock(&con->mutex);
1639
1640more:
1641 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
1642 con->in_base_pos);
1643 if (test_bit(CONNECTING, &con->state)) {
1644 if (!test_bit(NEGOTIATING, &con->state)) {
1645 dout("try_read connecting\n");
1646 ret = read_partial_banner(con);
1647 if (ret <= 0)
1648 goto done;
1649 if (process_banner(con) < 0) {
1650 ret = -1;
1651 goto out;
1652 }
1653 }
1654 ret = read_partial_connect(con);
1655 if (ret <= 0)
1656 goto done;
1657 if (process_connect(con) < 0) {
1658 ret = -1;
1659 goto out;
1660 }
1661 goto more;
1662 }
1663
1664 if (con->in_base_pos < 0) {
1665 /*
1666 * skipping + discarding content.
1667 *
1668 * FIXME: there must be a better way to do this!
1669 */
1670 static char buf[1024];
1671 int skip = min(1024, -con->in_base_pos);
1672 dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
1673 ret = ceph_tcp_recvmsg(con->sock, buf, skip);
1674 if (ret <= 0)
1675 goto done;
1676 con->in_base_pos += ret;
1677 if (con->in_base_pos)
1678 goto more;
1679 }
1680 if (con->in_tag == CEPH_MSGR_TAG_READY) {
1681 /*
1682 * what's next?
1683 */
1684 ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
1685 if (ret <= 0)
1686 goto done;
1687 dout("try_read got tag %d\n", (int)con->in_tag);
1688 switch (con->in_tag) {
1689 case CEPH_MSGR_TAG_MSG:
1690 prepare_read_message(con);
1691 break;
1692 case CEPH_MSGR_TAG_ACK:
1693 prepare_read_ack(con);
1694 break;
1695 case CEPH_MSGR_TAG_CLOSE:
1696 set_bit(CLOSED, &con->state); /* fixme */
1697 goto done;
1698 default:
1699 goto bad_tag;
1700 }
1701 }
1702 if (con->in_tag == CEPH_MSGR_TAG_MSG) {
1703 ret = read_partial_message(con);
1704 if (ret <= 0) {
1705 switch (ret) {
1706 case -EBADMSG:
1707 con->error_msg = "bad crc";
1708 ret = -EIO;
1709 goto out;
1710 case -EIO:
1711 con->error_msg = "io error";
1712 goto out;
1713 default:
1714 goto done;
1715 }
1716 }
1717 if (con->in_tag == CEPH_MSGR_TAG_READY)
1718 goto more;
1719 process_message(con);
1720 goto more;
1721 }
1722 if (con->in_tag == CEPH_MSGR_TAG_ACK) {
1723 ret = read_partial_ack(con);
1724 if (ret <= 0)
1725 goto done;
1726 process_ack(con);
1727 goto more;
1728 }
1729
1730done:
1731 ret = 0;
1732out:
1733 mutex_unlock(&con->mutex);
1734 dout("try_read done on %p\n", con);
1735 return ret;
1736
1737bad_tag:
1738 pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
1739 con->error_msg = "protocol error, garbage tag";
1740 ret = -1;
1741 goto out;
1742}
1743
1744
1745/*
1746 * Atomically queue work on a connection. Bump @con reference to
1747 * avoid races with connection teardown.
1748 *
1749 * There is some trickery going on with QUEUED and BUSY because we
1750 * only want a _single_ thread operating on each connection at any
1751 * point in time, but we want to use all available CPUs.
1752 *
1753 * The worker thread only proceeds if it can atomically set BUSY. It
1754 * clears QUEUED and does it's thing. When it thinks it's done, it
1755 * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
1756 * (tries again to set BUSY).
1757 *
1758 * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
1759 * try to queue work. If that fails (work is already queued, or BUSY)
1760 * we give up (work also already being done or is queued) but leave QUEUED
1761 * set so that the worker thread will loop if necessary.
1762 */
1763static void queue_con(struct ceph_connection *con)
1764{
1765 if (test_bit(DEAD, &con->state)) {
1766 dout("queue_con %p ignoring: DEAD\n",
1767 con);
1768 return;
1769 }
1770
1771 if (!con->ops->get(con)) {
1772 dout("queue_con %p ref count 0\n", con);
1773 return;
1774 }
1775
1776 set_bit(QUEUED, &con->state);
1777 if (test_bit(BUSY, &con->state)) {
1778 dout("queue_con %p - already BUSY\n", con);
1779 con->ops->put(con);
1780 } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
1781 dout("queue_con %p - already queued\n", con);
1782 con->ops->put(con);
1783 } else {
1784 dout("queue_con %p\n", con);
1785 }
1786}
1787
1788/*
1789 * Do some work on a connection. Drop a connection ref when we're done.
1790 */
1791static void con_work(struct work_struct *work)
1792{
1793 struct ceph_connection *con = container_of(work, struct ceph_connection,
1794 work.work);
1795 int backoff = 0;
1796
1797more:
1798 if (test_and_set_bit(BUSY, &con->state) != 0) {
1799 dout("con_work %p BUSY already set\n", con);
1800 goto out;
1801 }
1802 dout("con_work %p start, clearing QUEUED\n", con);
1803 clear_bit(QUEUED, &con->state);
1804
1805 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1806 dout("con_work CLOSED\n");
1807 con_close_socket(con);
1808 goto done;
1809 }
1810 if (test_and_clear_bit(OPENING, &con->state)) {
1811 /* reopen w/ new peer */
1812 dout("con_work OPENING\n");
1813 con_close_socket(con);
1814 }
1815
1816 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1817 try_read(con) < 0 ||
1818 try_write(con) < 0) {
1819 backoff = 1;
1820 ceph_fault(con); /* error/fault path */
1821 }
1822
1823done:
1824 clear_bit(BUSY, &con->state);
1825 dout("con->state=%lu\n", con->state);
1826 if (test_bit(QUEUED, &con->state)) {
1827 if (!backoff || test_bit(OPENING, &con->state)) {
1828 dout("con_work %p QUEUED reset, looping\n", con);
1829 goto more;
1830 }
1831 dout("con_work %p QUEUED reset, but just faulted\n", con);
1832 clear_bit(QUEUED, &con->state);
1833 }
1834 dout("con_work %p done\n", con);
1835
1836out:
1837 con->ops->put(con);
1838}
1839
1840
1841/*
1842 * Generic error/fault handler. A retry mechanism is used with
1843 * exponential backoff
1844 */
1845static void ceph_fault(struct ceph_connection *con)
1846{
1847 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
1848 pr_addr(&con->peer_addr.in_addr), con->error_msg);
1849 dout("fault %p state %lu to peer %s\n",
1850 con, con->state, pr_addr(&con->peer_addr.in_addr));
1851
1852 if (test_bit(LOSSYTX, &con->state)) {
1853 dout("fault on LOSSYTX channel\n");
1854 goto out;
1855 }
1856
1857 mutex_lock(&con->mutex);
1858 if (test_bit(CLOSED, &con->state))
1859 goto out_unlock;
1860
1861 con_close_socket(con);
1862
1863 if (con->in_msg) {
1864 ceph_msg_put(con->in_msg);
1865 con->in_msg = NULL;
1866 }
1867
1868 /* Requeue anything that hasn't been acked */
1869 list_splice_init(&con->out_sent, &con->out_queue);
1870
1871 /* If there are no messages in the queue, place the connection
1872 * in a STANDBY state (i.e., don't try to reconnect just yet). */
1873 if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
1874 dout("fault setting STANDBY\n");
1875 set_bit(STANDBY, &con->state);
1876 } else {
1877 /* retry after a delay. */
1878 if (con->delay == 0)
1879 con->delay = BASE_DELAY_INTERVAL;
1880 else if (con->delay < MAX_DELAY_INTERVAL)
1881 con->delay *= 2;
1882 dout("fault queueing %p delay %lu\n", con, con->delay);
1883 con->ops->get(con);
1884 if (queue_delayed_work(ceph_msgr_wq, &con->work,
1885 round_jiffies_relative(con->delay)) == 0)
1886 con->ops->put(con);
1887 }
1888
1889out_unlock:
1890 mutex_unlock(&con->mutex);
1891out:
1892 /*
1893 * in case we faulted due to authentication, invalidate our
1894 * current tickets so that we can get new ones.
1895 */
1896 if (con->auth_retry && con->ops->invalidate_authorizer) {
1897 dout("calling invalidate_authorizer()\n");
1898 con->ops->invalidate_authorizer(con);
1899 }
1900
1901 if (con->ops->fault)
1902 con->ops->fault(con);
1903}
1904
1905
1906
1907/*
1908 * create a new messenger instance
1909 */
1910struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1911{
1912 struct ceph_messenger *msgr;
1913
1914 msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
1915 if (msgr == NULL)
1916 return ERR_PTR(-ENOMEM);
1917
1918 spin_lock_init(&msgr->global_seq_lock);
1919
1920 /* the zero page is needed if a request is "canceled" while the message
1921 * is being written over the socket */
1922 msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1923 if (!msgr->zero_page) {
1924 kfree(msgr);
1925 return ERR_PTR(-ENOMEM);
1926 }
1927 kmap(msgr->zero_page);
1928
1929 if (myaddr)
1930 msgr->inst.addr = *myaddr;
1931
1932 /* select a random nonce */
1933 msgr->inst.addr.type = 0;
1934 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
1935 encode_my_addr(msgr);
1936
1937 dout("messenger_create %p\n", msgr);
1938 return msgr;
1939}
1940
1941void ceph_messenger_destroy(struct ceph_messenger *msgr)
1942{
1943 dout("destroy %p\n", msgr);
1944 kunmap(msgr->zero_page);
1945 __free_page(msgr->zero_page);
1946 kfree(msgr);
1947 dout("destroyed messenger %p\n", msgr);
1948}
1949
1950/*
1951 * Queue up an outgoing message on the given connection.
1952 */
1953void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1954{
1955 if (test_bit(CLOSED, &con->state)) {
1956 dout("con_send %p closed, dropping %p\n", con, msg);
1957 ceph_msg_put(msg);
1958 return;
1959 }
1960
1961 /* set src+dst */
1962 msg->hdr.src.name = con->msgr->inst.name;
1963 msg->hdr.src.addr = con->msgr->my_enc_addr;
1964 msg->hdr.orig_src = msg->hdr.src;
1965
1966 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1967
1968 /* queue */
1969 mutex_lock(&con->mutex);
1970 BUG_ON(!list_empty(&msg->list_head));
1971 list_add_tail(&msg->list_head, &con->out_queue);
1972 dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
1973 ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
1974 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1975 le32_to_cpu(msg->hdr.front_len),
1976 le32_to_cpu(msg->hdr.middle_len),
1977 le32_to_cpu(msg->hdr.data_len));
1978 mutex_unlock(&con->mutex);
1979
1980 /* if there wasn't anything waiting to send before, queue
1981 * new work */
1982 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
1983 queue_con(con);
1984}
1985
1986/*
1987 * Revoke a message that was previously queued for send
1988 */
1989void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
1990{
1991 mutex_lock(&con->mutex);
1992 if (!list_empty(&msg->list_head)) {
1993 dout("con_revoke %p msg %p\n", con, msg);
1994 list_del_init(&msg->list_head);
1995 ceph_msg_put(msg);
1996 msg->hdr.seq = 0;
1997 if (con->out_msg == msg) {
1998 ceph_msg_put(con->out_msg);
1999 con->out_msg = NULL;
2000 }
2001 if (con->out_kvec_is_msg) {
2002 con->out_skip = con->out_kvec_bytes;
2003 con->out_kvec_is_msg = false;
2004 }
2005 } else {
2006 dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg);
2007 }
2008 mutex_unlock(&con->mutex);
2009}
2010
2011/*
2012 * Revoke a message that we may be reading data into
2013 */
2014void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2015{
2016 mutex_lock(&con->mutex);
2017 if (con->in_msg && con->in_msg == msg) {
2018 unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
2019 unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
2020 unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
2021
2022 /* skip rest of message */
2023 dout("con_revoke_pages %p msg %p revoked\n", con, msg);
2024 con->in_base_pos = con->in_base_pos -
2025 sizeof(struct ceph_msg_header) -
2026 front_len -
2027 middle_len -
2028 data_len -
2029 sizeof(struct ceph_msg_footer);
2030 ceph_msg_put(con->in_msg);
2031 con->in_msg = NULL;
2032 con->in_tag = CEPH_MSGR_TAG_READY;
2033 } else {
2034 dout("con_revoke_pages %p msg %p pages %p no-op\n",
2035 con, con->in_msg, msg);
2036 }
2037 mutex_unlock(&con->mutex);
2038}
2039
2040/*
2041 * Queue a keepalive byte to ensure the tcp connection is alive.
2042 */
2043void ceph_con_keepalive(struct ceph_connection *con)
2044{
2045 if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
2046 test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2047 queue_con(con);
2048}
2049
2050
2051/*
2052 * construct a new message with given type, size
2053 * the new msg has a ref count of 1.
2054 */
2055struct ceph_msg *ceph_msg_new(int type, int front_len,
2056 int page_len, int page_off, struct page **pages)
2057{
2058 struct ceph_msg *m;
2059
2060 m = kmalloc(sizeof(*m), GFP_NOFS);
2061 if (m == NULL)
2062 goto out;
2063 kref_init(&m->kref);
2064 INIT_LIST_HEAD(&m->list_head);
2065
2066 m->hdr.type = cpu_to_le16(type);
2067 m->hdr.front_len = cpu_to_le32(front_len);
2068 m->hdr.middle_len = 0;
2069 m->hdr.data_len = cpu_to_le32(page_len);
2070 m->hdr.data_off = cpu_to_le16(page_off);
2071 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2072 m->footer.front_crc = 0;
2073 m->footer.middle_crc = 0;
2074 m->footer.data_crc = 0;
2075 m->front_max = front_len;
2076 m->front_is_vmalloc = false;
2077 m->more_to_follow = false;
2078 m->pool = NULL;
2079
2080 /* front */
2081 if (front_len) {
2082 if (front_len > PAGE_CACHE_SIZE) {
2083 m->front.iov_base = __vmalloc(front_len, GFP_NOFS,
2084 PAGE_KERNEL);
2085 m->front_is_vmalloc = true;
2086 } else {
2087 m->front.iov_base = kmalloc(front_len, GFP_NOFS);
2088 }
2089 if (m->front.iov_base == NULL) {
2090 pr_err("msg_new can't allocate %d bytes\n",
2091 front_len);
2092 goto out2;
2093 }
2094 } else {
2095 m->front.iov_base = NULL;
2096 }
2097 m->front.iov_len = front_len;
2098
2099 /* middle */
2100 m->middle = NULL;
2101
2102 /* data */
2103 m->nr_pages = calc_pages_for(page_off, page_len);
2104 m->pages = pages;
2105 m->pagelist = NULL;
2106
2107 dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len,
2108 m->nr_pages);
2109 return m;
2110
2111out2:
2112 ceph_msg_put(m);
2113out:
2114 pr_err("msg_new can't create type %d len %d\n", type, front_len);
2115 return ERR_PTR(-ENOMEM);
2116}
2117
2118/*
2119 * Allocate "middle" portion of a message, if it is needed and wasn't
2120 * allocated by alloc_msg. This allows us to read a small fixed-size
2121 * per-type header in the front and then gracefully fail (i.e.,
2122 * propagate the error to the caller based on info in the front) when
2123 * the middle is too large.
2124 */
2125static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2126{
2127 int type = le16_to_cpu(msg->hdr.type);
2128 int middle_len = le32_to_cpu(msg->hdr.middle_len);
2129
2130 dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
2131 ceph_msg_type_name(type), middle_len);
2132 BUG_ON(!middle_len);
2133 BUG_ON(msg->middle);
2134
2135 msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
2136 if (!msg->middle)
2137 return -ENOMEM;
2138 return 0;
2139}
2140
2141/*
2142 * Generic message allocator, for incoming messages.
2143 */
2144static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2145 struct ceph_msg_header *hdr,
2146 int *skip)
2147{
2148 int type = le16_to_cpu(hdr->type);
2149 int front_len = le32_to_cpu(hdr->front_len);
2150 int middle_len = le32_to_cpu(hdr->middle_len);
2151 struct ceph_msg *msg = NULL;
2152 int ret;
2153
2154 if (con->ops->alloc_msg) {
2155 mutex_unlock(&con->mutex);
2156 msg = con->ops->alloc_msg(con, hdr, skip);
2157 mutex_lock(&con->mutex);
2158 if (IS_ERR(msg))
2159 return msg;
2160
2161 if (*skip)
2162 return NULL;
2163 }
2164 if (!msg) {
2165 *skip = 0;
2166 msg = ceph_msg_new(type, front_len, 0, 0, NULL);
2167 if (!msg) {
2168 pr_err("unable to allocate msg type %d len %d\n",
2169 type, front_len);
2170 return ERR_PTR(-ENOMEM);
2171 }
2172 }
2173 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2174
2175 if (middle_len) {
2176 ret = ceph_alloc_middle(con, msg);
2177
2178 if (ret < 0) {
2179 ceph_msg_put(msg);
2180 return msg;
2181 }
2182 }
2183
2184 return msg;
2185}
2186
2187
2188/*
2189 * Free a generically kmalloc'd message.
2190 */
2191void ceph_msg_kfree(struct ceph_msg *m)
2192{
2193 dout("msg_kfree %p\n", m);
2194 if (m->front_is_vmalloc)
2195 vfree(m->front.iov_base);
2196 else
2197 kfree(m->front.iov_base);
2198 kfree(m);
2199}
2200
2201/*
2202 * Drop a msg ref. Destroy as needed.
2203 */
2204void ceph_msg_last_put(struct kref *kref)
2205{
2206 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
2207
2208 dout("ceph_msg_put last one on %p\n", m);
2209 WARN_ON(!list_empty(&m->list_head));
2210
2211 /* drop middle, data, if any */
2212 if (m->middle) {
2213 ceph_buffer_put(m->middle);
2214 m->middle = NULL;
2215 }
2216 m->nr_pages = 0;
2217 m->pages = NULL;
2218
2219 if (m->pagelist) {
2220 ceph_pagelist_release(m->pagelist);
2221 kfree(m->pagelist);
2222 m->pagelist = NULL;
2223 }
2224
2225 if (m->pool)
2226 ceph_msgpool_put(m->pool, m);
2227 else
2228 ceph_msg_kfree(m);
2229}
2230
2231void ceph_msg_dump(struct ceph_msg *msg)
2232{
2233 pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
2234 msg->front_max, msg->nr_pages);
2235 print_hex_dump(KERN_DEBUG, "header: ",
2236 DUMP_PREFIX_OFFSET, 16, 1,
2237 &msg->hdr, sizeof(msg->hdr), true);
2238 print_hex_dump(KERN_DEBUG, " front: ",
2239 DUMP_PREFIX_OFFSET, 16, 1,
2240 msg->front.iov_base, msg->front.iov_len, true);
2241 if (msg->middle)
2242 print_hex_dump(KERN_DEBUG, "middle: ",
2243 DUMP_PREFIX_OFFSET, 16, 1,
2244 msg->middle->vec.iov_base,
2245 msg->middle->vec.iov_len, true);
2246 print_hex_dump(KERN_DEBUG, "footer: ",
2247 DUMP_PREFIX_OFFSET, 16, 1,
2248 &msg->footer, sizeof(msg->footer), true);
2249}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
new file mode 100644
index 000000000000..a343dae73cdc
--- /dev/null
+++ b/fs/ceph/messenger.h
@@ -0,0 +1,255 @@
1#ifndef __FS_CEPH_MESSENGER_H
2#define __FS_CEPH_MESSENGER_H
3
4#include <linux/kref.h>
5#include <linux/mutex.h>
6#include <linux/net.h>
7#include <linux/radix-tree.h>
8#include <linux/uio.h>
9#include <linux/version.h>
10#include <linux/workqueue.h>
11
12#include "types.h"
13#include "buffer.h"
14
15struct ceph_msg;
16struct ceph_connection;
17
18extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */
19
/*
 * Callbacks through which the messenger reports connection events to
 * whoever owns the connection.
 */
struct ceph_connection_operations {
	/*
	 * NOTE(review): presumably take/drop a reference on the connection
	 * (inferred from the names; confirm against the implementations).
	 */
	struct ceph_connection *(*get)(struct ceph_connection *);
	void (*put)(struct ceph_connection *);

	/* handle an incoming message */
	void (*dispatch)(struct ceph_connection *con, struct ceph_msg *m);

	/* authorize an outgoing connection */
	int (*get_authorizer)(struct ceph_connection *con,
			      void **buf, int *len, int *proto,
			      void **reply_buf, int *reply_len,
			      int force_new);
	int (*verify_authorizer_reply)(struct ceph_connection *con, int len);
	int (*invalidate_authorizer)(struct ceph_connection *con);

	/* protocol version mismatch */
	void (*bad_proto)(struct ceph_connection *con);

	/* there was some error on the socket (disconnect, whatever) */
	void (*fault)(struct ceph_connection *con);

	/*
	 * a remote host has terminated a message exchange session, and
	 * messages we sent (or they tried to send us) may be lost.
	 */
	void (*peer_reset)(struct ceph_connection *con);

	/*
	 * allocate the message an incoming header should be read into;
	 * *skip lets the callback ask for the message to be skipped.
	 */
	struct ceph_msg *(*alloc_msg)(struct ceph_connection *con,
				      struct ceph_msg_header *hdr,
				      int *skip);
};
51
52extern const char *ceph_name_type_str(int t);
53
/*
 * Expands to TWO printf-style arguments (type string, number); use with
 * format string %s%d.
 * NOTE(review): the second argument comes from le64_to_cpu(); confirm that
 * callers' format specifier matches its width.
 */
#define ENTITY_NAME(n) \
	ceph_name_type_str((n).type), le64_to_cpu((n).num)
56
57struct ceph_messenger {
58 struct ceph_entity_inst inst; /* my name+address */
59 struct ceph_entity_addr my_enc_addr;
60 struct page *zero_page; /* used in certain error cases */
61
62 bool nocrc;
63
64 /*
65 * the global_seq counts connections i (attempt to) initiate
66 * in order to disambiguate certain connect race conditions.
67 */
68 u32 global_seq;
69 spinlock_t global_seq_lock;
70};
71
72/*
73 * a single message. it contains a header (src, dest, message type, etc.),
74 * footer (crc values, mainly), a "front" message body, and possibly a
75 * data payload (stored in some number of pages).
76 */
77struct ceph_msg {
78 struct ceph_msg_header hdr; /* header */
79 struct ceph_msg_footer footer; /* footer */
80 struct kvec front; /* unaligned blobs of message */
81 struct ceph_buffer *middle;
82 struct page **pages; /* data payload. NOT OWNER. */
83 unsigned nr_pages; /* size of page array */
84 struct ceph_pagelist *pagelist; /* instead of pages */
85 struct list_head list_head;
86 struct kref kref;
87 bool front_is_vmalloc;
88 bool more_to_follow;
89 int front_max;
90
91 struct ceph_msgpool *pool;
92};
93
/*
 * Position within a message's data payload while it is being transferred.
 */
struct ceph_msg_pos {
	int page;		/* which page */
	int page_pos;		/* offset in page */
	int data_pos;		/* offset in data payload */
	int did_page_crc;	/* true if we've calculated crc for current page */
};
99
/*
 * Connection fault delay defaults, for exponential backoff: the starting
 * delay and the upper bound, both in jiffies.
 */
#define BASE_DELAY_INTERVAL	(HZ/2)
#define MAX_DELAY_INTERVAL	(5 * 60 * HZ)
103
/*
 * ceph_connection state bit flags (bit numbers in ceph_connection.state).
 *
 * QUEUED and BUSY are used together to ensure that only a single
 * thread is currently opening, reading or writing data to the socket.
 */
#define LOSSYTX			0  /* we can close channel or drop messages
				    * on errors */
#define CONNECTING		1
#define NEGOTIATING		2
#define KEEPALIVE_PENDING	3
#define WRITE_PENDING		4  /* we have data ready to send */
#define QUEUED			5  /* there is work queued on this
				    * connection */
#define BUSY			6  /* work is being done */
#define STANDBY			8  /* no outgoing messages, socket closed.
				    * we keep the ceph_connection around to
				    * maintain shared state with the peer. */
#define CLOSED			10 /* we've closed the connection */
#define SOCK_CLOSED		11 /* socket state changed to closed */
#define OPENING			13 /* open connection w/ (possibly new)
				    * peer */
#define DEAD			14 /* dead, about to kfree */
124
125/*
126 * A single connection with another host.
127 *
128 * We maintain a queue of outgoing messages, and some session state to
129 * ensure that we can preserve the lossless, ordered delivery of
130 * messages in the case of a TCP disconnect.
131 */
132struct ceph_connection {
133 void *private;
134 atomic_t nref;
135
136 const struct ceph_connection_operations *ops;
137
138 struct ceph_messenger *msgr;
139 struct socket *sock;
140 unsigned long state; /* connection state (see flags above) */
141 const char *error_msg; /* error message, if any */
142
143 struct ceph_entity_addr peer_addr; /* peer address */
144 struct ceph_entity_name peer_name; /* peer name */
145 struct ceph_entity_addr peer_addr_for_me;
146 u32 connect_seq; /* identify the most recent connection
147 attempt for this connection, client */
148 u32 peer_global_seq; /* peer's global seq for this connection */
149
150 int auth_retry; /* true if we need a newer authorizer */
151 void *auth_reply_buf; /* where to put the authorizer reply */
152 int auth_reply_buf_len;
153
154 struct mutex mutex;
155
156 /* out queue */
157 struct list_head out_queue;
158 struct list_head out_sent; /* sending or sent but unacked */
159 u64 out_seq; /* last message queued for send */
160 u64 out_seq_sent; /* last message sent */
161 bool out_keepalive_pending;
162
163 u64 in_seq, in_seq_acked; /* last message received, acked */
164
165 /* connection negotiation temps */
166 char in_banner[CEPH_BANNER_MAX_LEN];
167 union {
168 struct { /* outgoing connection */
169 struct ceph_msg_connect out_connect;
170 struct ceph_msg_connect_reply in_reply;
171 };
172 struct { /* incoming */
173 struct ceph_msg_connect in_connect;
174 struct ceph_msg_connect_reply out_reply;
175 };
176 };
177 struct ceph_entity_addr actual_peer_addr;
178
179 /* message out temps */
180 struct ceph_msg *out_msg; /* sending message (== tail of
181 out_sent) */
182 bool out_msg_done;
183 struct ceph_msg_pos out_msg_pos;
184
185 struct kvec out_kvec[8], /* sending header/footer data */
186 *out_kvec_cur;
187 int out_kvec_left; /* kvec's left in out_kvec */
188 int out_skip; /* skip this many bytes */
189 int out_kvec_bytes; /* total bytes left */
190 bool out_kvec_is_msg; /* kvec refers to out_msg */
191 int out_more; /* there is more data after the kvecs */
192 __le64 out_temp_ack; /* for writing an ack */
193
194 /* message in temps */
195 struct ceph_msg_header in_hdr;
196 struct ceph_msg *in_msg;
197 struct ceph_msg_pos in_msg_pos;
198 u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
199
200 char in_tag; /* protocol control byte */
201 int in_base_pos; /* bytes read */
202 __le64 in_temp_ack; /* for reading an ack */
203
204 struct delayed_work work; /* send|recv work */
205 unsigned long delay; /* current delay interval */
206};
207
208
209extern const char *pr_addr(const struct sockaddr_storage *ss);
210extern int ceph_parse_ips(const char *c, const char *end,
211 struct ceph_entity_addr *addr,
212 int max_count, int *count);
213
214
215extern int ceph_msgr_init(void);
216extern void ceph_msgr_exit(void);
217
218extern struct ceph_messenger *ceph_messenger_create(
219 struct ceph_entity_addr *myaddr);
220extern void ceph_messenger_destroy(struct ceph_messenger *);
221
222extern void ceph_con_init(struct ceph_messenger *msgr,
223 struct ceph_connection *con);
224extern void ceph_con_open(struct ceph_connection *con,
225 struct ceph_entity_addr *addr);
226extern bool ceph_con_opened(struct ceph_connection *con);
227extern void ceph_con_close(struct ceph_connection *con);
228extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
229extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
230extern void ceph_con_revoke_message(struct ceph_connection *con,
231 struct ceph_msg *msg);
232extern void ceph_con_keepalive(struct ceph_connection *con);
233extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
234extern void ceph_con_put(struct ceph_connection *con);
235
236extern struct ceph_msg *ceph_msg_new(int type, int front_len,
237 int page_len, int page_off,
238 struct page **pages);
239extern void ceph_msg_kfree(struct ceph_msg *m);
240
241
242static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
243{
244 kref_get(&msg->kref);
245 return msg;
246}
247extern void ceph_msg_last_put(struct kref *kref);
248static inline void ceph_msg_put(struct ceph_msg *msg)
249{
250 kref_put(&msg->kref, ceph_msg_last_put);
251}
252
253extern void ceph_msg_dump(struct ceph_msg *msg);
254
255#endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
new file mode 100644
index 000000000000..8fdc011ca956
--- /dev/null
+++ b/fs/ceph/mon_client.c
@@ -0,0 +1,835 @@
1#include "ceph_debug.h"
2
3#include <linux/types.h>
4#include <linux/slab.h>
5#include <linux/random.h>
6#include <linux/sched.h>
7
8#include "mon_client.h"
9#include "super.h"
10#include "auth.h"
11#include "decode.h"
12
13/*
14 * Interact with Ceph monitor cluster. Handle requests for new map
15 * versions, and periodically resend as needed. Also implement
16 * statfs() and umount().
17 *
18 * A small cluster of Ceph "monitors" are responsible for managing critical
19 * cluster configuration and state information. An odd number (e.g., 3, 5)
20 * of cmon daemons use a modified version of the Paxos part-time parliament
21 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
22 * list of clients who have mounted the file system.
23 *
24 * We maintain an open, active session with a monitor at all times in order to
25 * receive timely MDSMap updates. We periodically send a keepalive byte on the
26 * TCP socket to ensure we detect a failure. If the connection does break, we
27 * randomly hunt for a new monitor. Once the connection is reestablished, we
28 * resend any outstanding requests.
29 */
30
31const static struct ceph_connection_operations mon_con_ops;
32
33static int __validate_auth(struct ceph_mon_client *monc);
34
35/*
36 * Decode a monmap blob (e.g., during mount).
37 */
38struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
39{
40 struct ceph_monmap *m = NULL;
41 int i, err = -EINVAL;
42 struct ceph_fsid fsid;
43 u32 epoch, num_mon;
44 u16 version;
45 u32 len;
46
47 ceph_decode_32_safe(&p, end, len, bad);
48 ceph_decode_need(&p, end, len, bad);
49
50 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
51
52 ceph_decode_16_safe(&p, end, version, bad);
53
54 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
55 ceph_decode_copy(&p, &fsid, sizeof(fsid));
56 epoch = ceph_decode_32(&p);
57
58 num_mon = ceph_decode_32(&p);
59 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
60
61 if (num_mon >= CEPH_MAX_MON)
62 goto bad;
63 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
64 if (m == NULL)
65 return ERR_PTR(-ENOMEM);
66 m->fsid = fsid;
67 m->epoch = epoch;
68 m->num_mon = num_mon;
69 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
70 for (i = 0; i < num_mon; i++)
71 ceph_decode_addr(&m->mon_inst[i].addr);
72
73 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
74 m->num_mon);
75 for (i = 0; i < m->num_mon; i++)
76 dout("monmap_decode mon%d is %s\n", i,
77 pr_addr(&m->mon_inst[i].addr.in_addr));
78 return m;
79
80bad:
81 dout("monmap_decode failed with %d\n", err);
82 kfree(m);
83 return ERR_PTR(err);
84}
85
86/*
87 * return true if *addr is included in the monmap.
88 */
89int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
90{
91 int i;
92
93 for (i = 0; i < m->num_mon; i++)
94 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
95 return 1;
96 return 0;
97}
98
99/*
100 * Send an auth request.
101 */
102static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
103{
104 monc->pending_auth = 1;
105 monc->m_auth->front.iov_len = len;
106 monc->m_auth->hdr.front_len = cpu_to_le32(len);
107 ceph_msg_get(monc->m_auth); /* keep our ref */
108 ceph_con_send(monc->con, monc->m_auth);
109}
110
111/*
112 * Close monitor session, if any.
113 */
114static void __close_session(struct ceph_mon_client *monc)
115{
116 if (monc->con) {
117 dout("__close_session closing mon%d\n", monc->cur_mon);
118 ceph_con_revoke(monc->con, monc->m_auth);
119 ceph_con_close(monc->con);
120 monc->cur_mon = -1;
121 monc->pending_auth = 0;
122 ceph_auth_reset(monc->auth);
123 }
124}
125
126/*
127 * Open a session with a (new) monitor.
128 */
129static int __open_session(struct ceph_mon_client *monc)
130{
131 char r;
132 int ret;
133
134 if (monc->cur_mon < 0) {
135 get_random_bytes(&r, 1);
136 monc->cur_mon = r % monc->monmap->num_mon;
137 dout("open_session num=%d r=%d -> mon%d\n",
138 monc->monmap->num_mon, r, monc->cur_mon);
139 monc->sub_sent = 0;
140 monc->sub_renew_after = jiffies; /* i.e., expired */
141 monc->want_next_osdmap = !!monc->want_next_osdmap;
142
143 dout("open_session mon%d opening\n", monc->cur_mon);
144 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
145 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
146 ceph_con_open(monc->con,
147 &monc->monmap->mon_inst[monc->cur_mon].addr);
148
149 /* initiatiate authentication handshake */
150 ret = ceph_auth_build_hello(monc->auth,
151 monc->m_auth->front.iov_base,
152 monc->m_auth->front_max);
153 __send_prepared_auth_request(monc, ret);
154 } else {
155 dout("open_session mon%d already open\n", monc->cur_mon);
156 }
157 return 0;
158}
159
160static bool __sub_expired(struct ceph_mon_client *monc)
161{
162 return time_after_eq(jiffies, monc->sub_renew_after);
163}
164
165/*
166 * Reschedule delayed work timer.
167 */
168static void __schedule_delayed(struct ceph_mon_client *monc)
169{
170 unsigned delay;
171
172 if (monc->cur_mon < 0 || __sub_expired(monc))
173 delay = 10 * HZ;
174 else
175 delay = 20 * HZ;
176 dout("__schedule_delayed after %u\n", delay);
177 schedule_delayed_work(&monc->delayed_work, delay);
178}
179
180/*
181 * Send subscribe request for mdsmap and/or osdmap.
182 */
183static void __send_subscribe(struct ceph_mon_client *monc)
184{
185 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
186 (unsigned)monc->sub_sent, __sub_expired(monc),
187 monc->want_next_osdmap);
188 if ((__sub_expired(monc) && !monc->sub_sent) ||
189 monc->want_next_osdmap == 1) {
190 struct ceph_msg *msg;
191 struct ceph_mon_subscribe_item *i;
192 void *p, *end;
193
194 msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
195 if (!msg)
196 return;
197
198 p = msg->front.iov_base;
199 end = p + msg->front.iov_len;
200
201 dout("__send_subscribe to 'mdsmap' %u+\n",
202 (unsigned)monc->have_mdsmap);
203 if (monc->want_next_osdmap) {
204 dout("__send_subscribe to 'osdmap' %u\n",
205 (unsigned)monc->have_osdmap);
206 ceph_encode_32(&p, 3);
207 ceph_encode_string(&p, end, "osdmap", 6);
208 i = p;
209 i->have = cpu_to_le64(monc->have_osdmap);
210 i->onetime = 1;
211 p += sizeof(*i);
212 monc->want_next_osdmap = 2; /* requested */
213 } else {
214 ceph_encode_32(&p, 2);
215 }
216 ceph_encode_string(&p, end, "mdsmap", 6);
217 i = p;
218 i->have = cpu_to_le64(monc->have_mdsmap);
219 i->onetime = 0;
220 p += sizeof(*i);
221 ceph_encode_string(&p, end, "monmap", 6);
222 i = p;
223 i->have = 0;
224 i->onetime = 0;
225 p += sizeof(*i);
226
227 msg->front.iov_len = p - msg->front.iov_base;
228 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
229 ceph_con_send(monc->con, msg);
230
231 monc->sub_sent = jiffies | 1; /* never 0 */
232 }
233}
234
235static void handle_subscribe_ack(struct ceph_mon_client *monc,
236 struct ceph_msg *msg)
237{
238 unsigned seconds;
239 struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
240
241 if (msg->front.iov_len < sizeof(*h))
242 goto bad;
243 seconds = le32_to_cpu(h->duration);
244
245 mutex_lock(&monc->mutex);
246 if (monc->hunting) {
247 pr_info("mon%d %s session established\n",
248 monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
249 monc->hunting = false;
250 }
251 dout("handle_subscribe_ack after %d seconds\n", seconds);
252 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
253 monc->sub_sent = 0;
254 mutex_unlock(&monc->mutex);
255 return;
256bad:
257 pr_err("got corrupt subscribe-ack msg\n");
258 ceph_msg_dump(msg);
259}
260
261/*
262 * Keep track of which maps we have
263 */
264int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
265{
266 mutex_lock(&monc->mutex);
267 monc->have_mdsmap = got;
268 mutex_unlock(&monc->mutex);
269 return 0;
270}
271
272int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
273{
274 mutex_lock(&monc->mutex);
275 monc->have_osdmap = got;
276 monc->want_next_osdmap = 0;
277 mutex_unlock(&monc->mutex);
278 return 0;
279}
280
281/*
282 * Register interest in the next osdmap
283 */
284void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
285{
286 dout("request_next_osdmap have %u\n", monc->have_osdmap);
287 mutex_lock(&monc->mutex);
288 if (!monc->want_next_osdmap)
289 monc->want_next_osdmap = 1;
290 if (monc->want_next_osdmap < 2)
291 __send_subscribe(monc);
292 mutex_unlock(&monc->mutex);
293}
294
295/*
296 *
297 */
298int ceph_monc_open_session(struct ceph_mon_client *monc)
299{
300 if (!monc->con) {
301 monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
302 if (!monc->con)
303 return -ENOMEM;
304 ceph_con_init(monc->client->msgr, monc->con);
305 monc->con->private = monc;
306 monc->con->ops = &mon_con_ops;
307 }
308
309 mutex_lock(&monc->mutex);
310 __open_session(monc);
311 __schedule_delayed(monc);
312 mutex_unlock(&monc->mutex);
313 return 0;
314}
315
316/*
317 * The monitor responds with mount ack indicate mount success. The
318 * included client ticket allows the client to talk to MDSs and OSDs.
319 */
320static void ceph_monc_handle_map(struct ceph_mon_client *monc,
321 struct ceph_msg *msg)
322{
323 struct ceph_client *client = monc->client;
324 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
325 void *p, *end;
326
327 mutex_lock(&monc->mutex);
328
329 dout("handle_monmap\n");
330 p = msg->front.iov_base;
331 end = p + msg->front.iov_len;
332
333 monmap = ceph_monmap_decode(p, end);
334 if (IS_ERR(monmap)) {
335 pr_err("problem decoding monmap, %d\n",
336 (int)PTR_ERR(monmap));
337 goto out;
338 }
339
340 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
341 kfree(monmap);
342 goto out;
343 }
344
345 client->monc.monmap = monmap;
346 kfree(old);
347
348out:
349 mutex_unlock(&monc->mutex);
350 wake_up(&client->auth_wq);
351}
352
353/*
354 * statfs
355 */
356static struct ceph_mon_statfs_request *__lookup_statfs(
357 struct ceph_mon_client *monc, u64 tid)
358{
359 struct ceph_mon_statfs_request *req;
360 struct rb_node *n = monc->statfs_request_tree.rb_node;
361
362 while (n) {
363 req = rb_entry(n, struct ceph_mon_statfs_request, node);
364 if (tid < req->tid)
365 n = n->rb_left;
366 else if (tid > req->tid)
367 n = n->rb_right;
368 else
369 return req;
370 }
371 return NULL;
372}
373
374static void __insert_statfs(struct ceph_mon_client *monc,
375 struct ceph_mon_statfs_request *new)
376{
377 struct rb_node **p = &monc->statfs_request_tree.rb_node;
378 struct rb_node *parent = NULL;
379 struct ceph_mon_statfs_request *req = NULL;
380
381 while (*p) {
382 parent = *p;
383 req = rb_entry(parent, struct ceph_mon_statfs_request, node);
384 if (new->tid < req->tid)
385 p = &(*p)->rb_left;
386 else if (new->tid > req->tid)
387 p = &(*p)->rb_right;
388 else
389 BUG();
390 }
391
392 rb_link_node(&new->node, parent, p);
393 rb_insert_color(&new->node, &monc->statfs_request_tree);
394}
395
396static void handle_statfs_reply(struct ceph_mon_client *monc,
397 struct ceph_msg *msg)
398{
399 struct ceph_mon_statfs_request *req;
400 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
401 u64 tid;
402
403 if (msg->front.iov_len != sizeof(*reply))
404 goto bad;
405 tid = le64_to_cpu(msg->hdr.tid);
406 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
407
408 mutex_lock(&monc->mutex);
409 req = __lookup_statfs(monc, tid);
410 if (req) {
411 *req->buf = reply->st;
412 req->result = 0;
413 }
414 mutex_unlock(&monc->mutex);
415 if (req)
416 complete(&req->completion);
417 return;
418
419bad:
420 pr_err("corrupt statfs reply, no tid\n");
421 ceph_msg_dump(msg);
422}
423
424/*
425 * (re)send a statfs request
426 */
427static int send_statfs(struct ceph_mon_client *monc,
428 struct ceph_mon_statfs_request *req)
429{
430 struct ceph_msg *msg;
431 struct ceph_mon_statfs *h;
432
433 dout("send_statfs tid %llu\n", req->tid);
434 msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
435 if (IS_ERR(msg))
436 return PTR_ERR(msg);
437 req->request = msg;
438 msg->hdr.tid = cpu_to_le64(req->tid);
439 h = msg->front.iov_base;
440 h->monhdr.have_version = 0;
441 h->monhdr.session_mon = cpu_to_le16(-1);
442 h->monhdr.session_mon_tid = 0;
443 h->fsid = monc->monmap->fsid;
444 ceph_con_send(monc->con, msg);
445 return 0;
446}
447
448/*
449 * Do a synchronous statfs().
450 */
451int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
452{
453 struct ceph_mon_statfs_request req;
454 int err;
455
456 req.buf = buf;
457 init_completion(&req.completion);
458
459 /* allocate memory for reply */
460 err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
461 if (err)
462 return err;
463
464 /* register request */
465 mutex_lock(&monc->mutex);
466 req.tid = ++monc->last_tid;
467 req.last_attempt = jiffies;
468 req.delay = BASE_DELAY_INTERVAL;
469 __insert_statfs(monc, &req);
470 monc->num_statfs_requests++;
471 mutex_unlock(&monc->mutex);
472
473 /* send request and wait */
474 err = send_statfs(monc, &req);
475 if (!err)
476 err = wait_for_completion_interruptible(&req.completion);
477
478 mutex_lock(&monc->mutex);
479 rb_erase(&req.node, &monc->statfs_request_tree);
480 monc->num_statfs_requests--;
481 ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
482 mutex_unlock(&monc->mutex);
483
484 if (!err)
485 err = req.result;
486 return err;
487}
488
489/*
490 * Resend pending statfs requests.
491 */
492static void __resend_statfs(struct ceph_mon_client *monc)
493{
494 struct ceph_mon_statfs_request *req;
495 struct rb_node *p;
496
497 for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) {
498 req = rb_entry(p, struct ceph_mon_statfs_request, node);
499 send_statfs(monc, req);
500 }
501}
502
503/*
504 * Delayed work. If we haven't mounted yet, retry. Otherwise,
505 * renew/retry subscription as needed (in case it is timing out, or we
506 * got an ENOMEM). And keep the monitor connection alive.
507 */
508static void delayed_work(struct work_struct *work)
509{
510 struct ceph_mon_client *monc =
511 container_of(work, struct ceph_mon_client, delayed_work.work);
512
513 dout("monc delayed_work\n");
514 mutex_lock(&monc->mutex);
515 if (monc->hunting) {
516 __close_session(monc);
517 __open_session(monc); /* continue hunting */
518 } else {
519 ceph_con_keepalive(monc->con);
520
521 __validate_auth(monc);
522
523 if (monc->auth->ops->is_authenticated(monc->auth))
524 __send_subscribe(monc);
525 }
526 __schedule_delayed(monc);
527 mutex_unlock(&monc->mutex);
528}
529
530/*
531 * On startup, we build a temporary monmap populated with the IPs
532 * provided by mount(2).
533 */
534static int build_initial_monmap(struct ceph_mon_client *monc)
535{
536 struct ceph_mount_args *args = monc->client->mount_args;
537 struct ceph_entity_addr *mon_addr = args->mon_addr;
538 int num_mon = args->num_mon;
539 int i;
540
541 /* build initial monmap */
542 monc->monmap = kzalloc(sizeof(*monc->monmap) +
543 num_mon*sizeof(monc->monmap->mon_inst[0]),
544 GFP_KERNEL);
545 if (!monc->monmap)
546 return -ENOMEM;
547 for (i = 0; i < num_mon; i++) {
548 monc->monmap->mon_inst[i].addr = mon_addr[i];
549 monc->monmap->mon_inst[i].addr.nonce = 0;
550 monc->monmap->mon_inst[i].name.type =
551 CEPH_ENTITY_TYPE_MON;
552 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
553 }
554 monc->monmap->num_mon = num_mon;
555 monc->have_fsid = false;
556
557 /* release addr memory */
558 kfree(args->mon_addr);
559 args->mon_addr = NULL;
560 args->num_mon = 0;
561 return 0;
562}
563
564int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
565{
566 int err = 0;
567
568 dout("init\n");
569 memset(monc, 0, sizeof(*monc));
570 monc->client = cl;
571 monc->monmap = NULL;
572 mutex_init(&monc->mutex);
573
574 err = build_initial_monmap(monc);
575 if (err)
576 goto out;
577
578 monc->con = NULL;
579
580 /* authentication */
581 monc->auth = ceph_auth_init(cl->mount_args->name,
582 cl->mount_args->secret);
583 if (IS_ERR(monc->auth))
584 return PTR_ERR(monc->auth);
585 monc->auth->want_keys =
586 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
587 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
588
589 /* msg pools */
590 err = ceph_msgpool_init(&monc->msgpool_subscribe_ack,
591 sizeof(struct ceph_mon_subscribe_ack), 1, false);
592 if (err < 0)
593 goto out_monmap;
594 err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
595 sizeof(struct ceph_mon_statfs_reply), 0, false);
596 if (err < 0)
597 goto out_pool1;
598 err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false);
599 if (err < 0)
600 goto out_pool2;
601
602 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
603 monc->pending_auth = 0;
604 if (IS_ERR(monc->m_auth)) {
605 err = PTR_ERR(monc->m_auth);
606 monc->m_auth = NULL;
607 goto out_pool3;
608 }
609
610 monc->cur_mon = -1;
611 monc->hunting = true;
612 monc->sub_renew_after = jiffies;
613 monc->sub_sent = 0;
614
615 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
616 monc->statfs_request_tree = RB_ROOT;
617 monc->num_statfs_requests = 0;
618 monc->last_tid = 0;
619
620 monc->have_mdsmap = 0;
621 monc->have_osdmap = 0;
622 monc->want_next_osdmap = 1;
623 return 0;
624
625out_pool3:
626 ceph_msgpool_destroy(&monc->msgpool_auth_reply);
627out_pool2:
628 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
629out_pool1:
630 ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
631out_monmap:
632 kfree(monc->monmap);
633out:
634 return err;
635}
636
637void ceph_monc_stop(struct ceph_mon_client *monc)
638{
639 dout("stop\n");
640 cancel_delayed_work_sync(&monc->delayed_work);
641
642 mutex_lock(&monc->mutex);
643 __close_session(monc);
644 if (monc->con) {
645 monc->con->private = NULL;
646 monc->con->ops->put(monc->con);
647 monc->con = NULL;
648 }
649 mutex_unlock(&monc->mutex);
650
651 ceph_auth_destroy(monc->auth);
652
653 ceph_msg_put(monc->m_auth);
654 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
655 ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
656 ceph_msgpool_destroy(&monc->msgpool_auth_reply);
657
658 kfree(monc->monmap);
659}
660
661static void handle_auth_reply(struct ceph_mon_client *monc,
662 struct ceph_msg *msg)
663{
664 int ret;
665
666 mutex_lock(&monc->mutex);
667 monc->pending_auth = 0;
668 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
669 msg->front.iov_len,
670 monc->m_auth->front.iov_base,
671 monc->m_auth->front_max);
672 if (ret < 0) {
673 monc->client->auth_err = ret;
674 wake_up(&monc->client->auth_wq);
675 } else if (ret > 0) {
676 __send_prepared_auth_request(monc, ret);
677 } else if (monc->auth->ops->is_authenticated(monc->auth)) {
678 dout("authenticated, starting session\n");
679
680 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
681 monc->client->msgr->inst.name.num = monc->auth->global_id;
682
683 __send_subscribe(monc);
684 __resend_statfs(monc);
685 }
686 mutex_unlock(&monc->mutex);
687}
688
689static int __validate_auth(struct ceph_mon_client *monc)
690{
691 int ret;
692
693 if (monc->pending_auth)
694 return 0;
695
696 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
697 monc->m_auth->front_max);
698 if (ret <= 0)
699 return ret; /* either an error, or no need to authenticate */
700 __send_prepared_auth_request(monc, ret);
701 return 0;
702}
703
704int ceph_monc_validate_auth(struct ceph_mon_client *monc)
705{
706 int ret;
707
708 mutex_lock(&monc->mutex);
709 ret = __validate_auth(monc);
710 mutex_unlock(&monc->mutex);
711 return ret;
712}
713
714/*
715 * handle incoming message
716 */
717static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
718{
719 struct ceph_mon_client *monc = con->private;
720 int type = le16_to_cpu(msg->hdr.type);
721
722 if (!monc)
723 return;
724
725 switch (type) {
726 case CEPH_MSG_AUTH_REPLY:
727 handle_auth_reply(monc, msg);
728 break;
729
730 case CEPH_MSG_MON_SUBSCRIBE_ACK:
731 handle_subscribe_ack(monc, msg);
732 break;
733
734 case CEPH_MSG_STATFS_REPLY:
735 handle_statfs_reply(monc, msg);
736 break;
737
738 case CEPH_MSG_MON_MAP:
739 ceph_monc_handle_map(monc, msg);
740 break;
741
742 case CEPH_MSG_MDS_MAP:
743 ceph_mdsc_handle_map(&monc->client->mdsc, msg);
744 break;
745
746 case CEPH_MSG_OSD_MAP:
747 ceph_osdc_handle_map(&monc->client->osdc, msg);
748 break;
749
750 default:
751 pr_err("received unknown message type %d %s\n", type,
752 ceph_msg_type_name(type));
753 }
754 ceph_msg_put(msg);
755}
756
757/*
758 * Allocate memory for incoming message
759 */
760static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
761 struct ceph_msg_header *hdr,
762 int *skip)
763{
764 struct ceph_mon_client *monc = con->private;
765 int type = le16_to_cpu(hdr->type);
766 int front_len = le32_to_cpu(hdr->front_len);
767 struct ceph_msg *m = NULL;
768
769 *skip = 0;
770
771 switch (type) {
772 case CEPH_MSG_MON_SUBSCRIBE_ACK:
773 m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len);
774 break;
775 case CEPH_MSG_STATFS_REPLY:
776 m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len);
777 break;
778 case CEPH_MSG_AUTH_REPLY:
779 m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len);
780 break;
781 case CEPH_MSG_MON_MAP:
782 case CEPH_MSG_MDS_MAP:
783 case CEPH_MSG_OSD_MAP:
784 m = ceph_msg_new(type, front_len, 0, 0, NULL);
785 break;
786 }
787
788 if (!m) {
789 pr_info("alloc_msg unknown type %d\n", type);
790 *skip = 1;
791 }
792 return m;
793}
794
795/*
796 * If the monitor connection resets, pick a new monitor and resubmit
797 * any pending requests.
798 */
799static void mon_fault(struct ceph_connection *con)
800{
801 struct ceph_mon_client *monc = con->private;
802
803 if (!monc)
804 return;
805
806 dout("mon_fault\n");
807 mutex_lock(&monc->mutex);
808 if (!con->private)
809 goto out;
810
811 if (monc->con && !monc->hunting)
812 pr_info("mon%d %s session lost, "
813 "hunting for new mon\n", monc->cur_mon,
814 pr_addr(&monc->con->peer_addr.in_addr));
815
816 __close_session(monc);
817 if (!monc->hunting) {
818 /* start hunting */
819 monc->hunting = true;
820 __open_session(monc);
821 } else {
822 /* already hunting, let's wait a bit */
823 __schedule_delayed(monc);
824 }
825out:
826 mutex_unlock(&monc->mutex);
827}
828
829const static struct ceph_connection_operations mon_con_ops = {
830 .get = ceph_con_get,
831 .put = ceph_con_put,
832 .dispatch = dispatch,
833 .fault = mon_fault,
834 .alloc_msg = mon_alloc_msg,
835};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
new file mode 100644
index 000000000000..b958ad5afa06
--- /dev/null
+++ b/fs/ceph/mon_client.h
@@ -0,0 +1,119 @@
1#ifndef _FS_CEPH_MON_CLIENT_H
2#define _FS_CEPH_MON_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/rbtree.h>
6
7#include "messenger.h"
8#include "msgpool.h"
9
10struct ceph_client;
11struct ceph_mount_args;
12struct ceph_auth_client;
13
14/*
15 * The monitor map enumerates the set of all monitors.
16 */
17struct ceph_monmap {
18 struct ceph_fsid fsid;
19 u32 epoch;
20 u32 num_mon;
21 struct ceph_entity_inst mon_inst[0];
22};
23
24struct ceph_mon_client;
25struct ceph_mon_statfs_request;
26
27
/*
 * Callback type used by the generic monitor request resend mechanism.
 */
typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
					 int newmon);
33
34/* a pending monitor request */
35struct ceph_mon_request {
36 struct ceph_mon_client *monc;
37 struct delayed_work delayed_work;
38 unsigned long delay;
39 ceph_monc_request_func_t do_request;
40};
41
42/*
43 * statfs() is done a bit differently because we need to get data back
44 * to the caller
45 */
46struct ceph_mon_statfs_request {
47 u64 tid;
48 struct rb_node node;
49 int result;
50 struct ceph_statfs *buf;
51 struct completion completion;
52 unsigned long last_attempt, delay; /* jiffies */
53 struct ceph_msg *request; /* original request */
54};
55
56struct ceph_mon_client {
57 struct ceph_client *client;
58 struct ceph_monmap *monmap;
59
60 struct mutex mutex;
61 struct delayed_work delayed_work;
62
63 struct ceph_auth_client *auth;
64 struct ceph_msg *m_auth;
65 int pending_auth;
66
67 bool hunting;
68 int cur_mon; /* last monitor i contacted */
69 unsigned long sub_sent, sub_renew_after;
70 struct ceph_connection *con;
71 bool have_fsid;
72
73 /* msg pools */
74 struct ceph_msgpool msgpool_subscribe_ack;
75 struct ceph_msgpool msgpool_statfs_reply;
76 struct ceph_msgpool msgpool_auth_reply;
77
78 /* pending statfs requests */
79 struct rb_root statfs_request_tree;
80 int num_statfs_requests;
81 u64 last_tid;
82
83 /* mds/osd map */
84 int want_next_osdmap; /* 1 = want, 2 = want+asked */
85 u32 have_osdmap, have_mdsmap;
86
87#ifdef CONFIG_DEBUG_FS
88 struct dentry *debugfs_file;
89#endif
90};
91
/* monmap helpers */
struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
int ceph_monmap_contains(struct ceph_monmap *m,
			 struct ceph_entity_addr *addr);

/* monitor client setup and teardown */
int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
void ceph_monc_stop(struct ceph_mon_client *monc);
98
99/*
100 * The model here is to indicate that we need a new map of at least
101 * epoch @want, and also call in when we receive a map. We will
102 * periodically rerequest the map from the monitor cluster until we
103 * get what we want.
104 */
105extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
106extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
107
108extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
109
110extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
111 struct ceph_statfs *buf);
112
113extern int ceph_monc_open_session(struct ceph_mon_client *monc);
114
115extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
116
117
118
119#endif
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
new file mode 100644
index 000000000000..ca3b44a89f2d
--- /dev/null
+++ b/fs/ceph/msgpool.c
@@ -0,0 +1,186 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/sched.h>
5#include <linux/types.h>
6#include <linux/vmalloc.h>
7
8#include "msgpool.h"
9
10/*
11 * We use msg pools to preallocate memory for messages we expect to
12 * receive over the wire, to avoid getting ourselves into OOM
 * conditions at unexpected times.  We use a few different
 * strategies:
15 *
16 * - for request/response type interactions, we preallocate the
17 * memory needed for the response when we generate the request.
18 *
19 * - for messages we can receive at any time from the MDS, we preallocate
20 * a pool of messages we can re-use.
21 *
22 * - for writeback, we preallocate some number of messages to use for
23 * requests and their replies, so that we always make forward
24 * progress.
25 *
26 * The msgpool behaves like a mempool_t, but keeps preallocated
27 * ceph_msgs strung together on a list_head instead of using a pointer
28 * vector. This avoids vector reallocation when we adjust the number
29 * of preallocated items (which happens frequently).
30 */
31
32
33/*
34 * Allocate or release as necessary to meet our target pool size.
35 */
36static int __fill_msgpool(struct ceph_msgpool *pool)
37{
38 struct ceph_msg *msg;
39
40 while (pool->num < pool->min) {
41 dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
42 pool->min);
43 spin_unlock(&pool->lock);
44 msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
45 spin_lock(&pool->lock);
46 if (IS_ERR(msg))
47 return PTR_ERR(msg);
48 msg->pool = pool;
49 list_add(&msg->list_head, &pool->msgs);
50 pool->num++;
51 }
52 while (pool->num > pool->min) {
53 msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
54 dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
55 pool->min, msg);
56 list_del_init(&msg->list_head);
57 pool->num--;
58 ceph_msg_kfree(msg);
59 }
60 return 0;
61}
62
63int ceph_msgpool_init(struct ceph_msgpool *pool,
64 int front_len, int min, bool blocking)
65{
66 int ret;
67
68 dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
69 spin_lock_init(&pool->lock);
70 pool->front_len = front_len;
71 INIT_LIST_HEAD(&pool->msgs);
72 pool->num = 0;
73 pool->min = min;
74 pool->blocking = blocking;
75 init_waitqueue_head(&pool->wait);
76
77 spin_lock(&pool->lock);
78 ret = __fill_msgpool(pool);
79 spin_unlock(&pool->lock);
80 return ret;
81}
82
83void ceph_msgpool_destroy(struct ceph_msgpool *pool)
84{
85 dout("msgpool_destroy %p\n", pool);
86 spin_lock(&pool->lock);
87 pool->min = 0;
88 __fill_msgpool(pool);
89 spin_unlock(&pool->lock);
90}
91
92int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
93{
94 int ret;
95
96 spin_lock(&pool->lock);
97 dout("msgpool_resv %p delta %d\n", pool, delta);
98 pool->min += delta;
99 ret = __fill_msgpool(pool);
100 spin_unlock(&pool->lock);
101 return ret;
102}
103
104struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
105{
106 wait_queue_t wait;
107 struct ceph_msg *msg;
108
109 if (front_len && front_len > pool->front_len) {
110 pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
111 pool, front_len, pool->front_len);
112 WARN_ON(1);
113
114 /* try to alloc a fresh message */
115 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
116 if (!IS_ERR(msg))
117 return msg;
118 }
119
120 if (!front_len)
121 front_len = pool->front_len;
122
123 if (pool->blocking) {
124 /* mempool_t behavior; first try to alloc */
125 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
126 if (!IS_ERR(msg))
127 return msg;
128 }
129
130 while (1) {
131 spin_lock(&pool->lock);
132 if (likely(pool->num)) {
133 msg = list_entry(pool->msgs.next, struct ceph_msg,
134 list_head);
135 list_del_init(&msg->list_head);
136 pool->num--;
137 dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
138 pool->num, pool->min);
139 spin_unlock(&pool->lock);
140 return msg;
141 }
142 pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
143 pool->min, pool->blocking ? "waiting" : "may fail");
144 spin_unlock(&pool->lock);
145
146 if (!pool->blocking) {
147 WARN_ON(1);
148
149 /* maybe we can allocate it now? */
150 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
151 if (!IS_ERR(msg))
152 return msg;
153
154 pr_err("msgpool_get %p empty + alloc failed\n", pool);
155 return ERR_PTR(-ENOMEM);
156 }
157
158 init_wait(&wait);
159 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
160 schedule();
161 finish_wait(&pool->wait, &wait);
162 }
163}
164
165void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
166{
167 spin_lock(&pool->lock);
168 if (pool->num < pool->min) {
169 /* reset msg front_len; user may have changed it */
170 msg->front.iov_len = pool->front_len;
171 msg->hdr.front_len = cpu_to_le32(pool->front_len);
172
173 kref_set(&msg->kref, 1); /* retake a single ref */
174 list_add(&msg->list_head, &pool->msgs);
175 pool->num++;
176 dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
177 pool->num, pool->min);
178 spin_unlock(&pool->lock);
179 wake_up(&pool->wait);
180 } else {
181 dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
182 pool->num, pool->min);
183 spin_unlock(&pool->lock);
184 ceph_msg_kfree(msg);
185 }
186}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
new file mode 100644
index 000000000000..bc834bfcd720
--- /dev/null
+++ b/fs/ceph/msgpool.h
@@ -0,0 +1,27 @@
1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL
3
4#include "messenger.h"
5
6/*
7 * we use memory pools for preallocating messages we may receive, to
8 * avoid unexpected OOM conditions.
9 */
10struct ceph_msgpool {
11 spinlock_t lock;
12 int front_len; /* preallocated payload size */
13 struct list_head msgs; /* msgs in the pool; each has 1 ref */
14 int num, min; /* cur, min # msgs in the pool */
15 bool blocking;
16 wait_queue_head_t wait;
17};
18
19extern int ceph_msgpool_init(struct ceph_msgpool *pool,
20 int front_len, int size, bool blocking);
21extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
22extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
23extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
24 int front_len);
25extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
26
27#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
new file mode 100644
index 000000000000..8aaab414f3f8
--- /dev/null
+++ b/fs/ceph/msgr.h
@@ -0,0 +1,158 @@
1#ifndef __MSGR_H
2#define __MSGR_H
3
4/*
5 * Data types for message passing layer used by Ceph.
6 */
7
#define CEPH_MON_PORT		6789	/* default monitor port */

/*
 * Port range that client-side processes try to bind to.  This exists
 * only so that tools such as nmap or wireshark can identify the
 * protocol.
 */
#define CEPH_PORT_FIRST		6789
#define CEPH_PORT_START		6800	/* non-monitors start here */
#define CEPH_PORT_LAST		6900
18
/*
 * TCP connection banner.  It carries a protocol version and must be
 * adjusted whenever the wire protocol changes.  Try to keep the
 * string length constant.
 */
#define CEPH_BANNER		"ceph v027"
#define CEPH_BANNER_MAX_LEN	30
26
27
/*
 * Rollover-safe type and comparator for 32-bit sequence numbers.
 *
 * The comparator returns a value that is negative, zero or positive
 * when @a is respectively before, equal to or after @b (the signed
 * distance between them, not strictly -1/0/1).
 */
typedef __u32 ceph_seq_t;

static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
{
	/*
	 * Subtract as unsigned (well-defined modulo 2^32) and only then
	 * reinterpret as signed; subtracting two signed values could
	 * overflow, which is undefined behaviour.
	 */
	return (__s32)(a - b);
}
38
39
/*
 * entity_name: the logical name of a process taking part in the
 * network, such as 'mds0' or 'osd3'.
 */
struct ceph_entity_name {
	__u8 type;		/* CEPH_ENTITY_TYPE_* */
	__le64 num;
} __attribute__((packed));
48
/* entity types; each is a distinct bit so they can be OR'd into masks */
#define CEPH_ENTITY_TYPE_MON	0x01
#define CEPH_ENTITY_TYPE_MDS	0x02
#define CEPH_ENTITY_TYPE_OSD	0x04
#define CEPH_ENTITY_TYPE_CLIENT	0x08
#define CEPH_ENTITY_TYPE_ADMIN	0x10
#define CEPH_ENTITY_TYPE_AUTH	0x20

#define CEPH_ENTITY_TYPE_ANY	0xFF

extern const char *ceph_entity_type_name(int type);
59
/*
 * entity_addr: a network address
 */
struct ceph_entity_addr {
	__le32 type;
	__le32 nonce;		/* unique id for process (e.g. pid) */
	struct sockaddr_storage in_addr;
} __attribute__((packed));
68
69struct ceph_entity_inst {
70 struct ceph_entity_name name;
71 struct ceph_entity_addr addr;
72} __attribute__ ((packed));
73
74
/* tags used by the message exchange protocol */
#define CEPH_MSGR_TAG_READY		1  /* server->client: ready for messages */
#define CEPH_MSGR_TAG_RESETSESSION	2  /* server->client: reset, try again */
/* server->client: wait for racing incoming connection */
#define CEPH_MSGR_TAG_WAIT		3
/* server->client + cseq: try again with higher cseq */
#define CEPH_MSGR_TAG_RETRY_SESSION	4
/* server->client + gseq: try again with higher gseq */
#define CEPH_MSGR_TAG_RETRY_GLOBAL	5
#define CEPH_MSGR_TAG_CLOSE		6  /* closing pipe */
#define CEPH_MSGR_TAG_MSG		7  /* message */
#define CEPH_MSGR_TAG_ACK		8  /* message ack */
#define CEPH_MSGR_TAG_KEEPALIVE		9  /* just a keepalive byte! */
#define CEPH_MSGR_TAG_BADPROTOVER	10 /* bad protocol version */
#define CEPH_MSGR_TAG_BADAUTHORIZER	11 /* bad authorizer */
#define CEPH_MSGR_TAG_FEATURES		12 /* insufficient features */
91
92
/*
 * Connection negotiation: the connect request.
 */
struct ceph_msg_connect {
	__le64 features;		/* supported feature bits */
	__le32 host_type;		/* CEPH_ENTITY_TYPE_* */
	__le32 global_seq;		/* count connections initiated by this host */
	__le32 connect_seq;		/* count connections initiated in this session */
	__le32 protocol_version;
	__le32 authorizer_protocol;
	__le32 authorizer_len;
	__u8 flags;			/* CEPH_MSG_CONNECT_* */
} __attribute__((packed));
106
/* connection negotiation: the reply to a connect request */
struct ceph_msg_connect_reply {
	__u8 tag;
	__le64 features;		/* feature bits for this session */
	__le32 global_seq;
	__le32 connect_seq;
	__le32 protocol_version;
	__le32 authorizer_len;
	__u8 flags;
} __attribute__((packed));
116
/* connect flag: messages i send may be safely dropped */
#define CEPH_MSG_CONNECT_LOSSY	1
118
119
120/*
121 * message header
122 */
123struct ceph_msg_header {
124 __le64 seq; /* message seq# for this session */
125 __le64 tid; /* transaction id */
126 __le16 type; /* message type */
127 __le16 priority; /* priority. higher value == higher priority */
128 __le16 version; /* version of message encoding */
129
130 __le32 front_len; /* bytes in main payload */
131 __le32 middle_len;/* bytes in middle payload */
132 __le32 data_len; /* bytes of data payload */
133 __le16 data_off; /* sender: include full offset;
134 receiver: mask against ~PAGE_MASK */
135
136 struct ceph_entity_inst src, orig_src;
137 __le32 reserved;
138 __le32 crc; /* header crc32c */
139} __attribute__ ((packed));
140
/* values for ceph_msg_header.priority; higher value == higher priority */
#define CEPH_MSG_PRIO_LOW	64
#define CEPH_MSG_PRIO_DEFAULT	127
#define CEPH_MSG_PRIO_HIGH	196
#define CEPH_MSG_PRIO_HIGHEST	255
145
/*
 * Footer; it follows the data payload.
 */
struct ceph_msg_footer {
	__le32 front_crc;
	__le32 middle_crc;
	__le32 data_crc;
	__u8 flags;
} __attribute__((packed));
153
/* footer flag bits */
#define CEPH_MSG_FOOTER_COMPLETE	(1 << 0)	/* msg wasn't aborted */
#define CEPH_MSG_FOOTER_NOCRC		(1 << 1)	/* no data crc */
156
157
158#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
new file mode 100644
index 000000000000..c7b4dedaace6
--- /dev/null
+++ b/fs/ceph/osd_client.c
@@ -0,0 +1,1550 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/highmem.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/slab.h>
8#include <linux/uaccess.h>
9
10#include "super.h"
11#include "osd_client.h"
12#include "messenger.h"
13#include "decode.h"
14#include "auth.h"
15
16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512
18
19const static struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd);
22
23static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
24
25/*
26 * Implement client access to distributed object storage cluster.
27 *
28 * All data objects are stored within a cluster/cloud of OSDs, or
29 * "object storage devices." (Note that Ceph OSDs have _nothing_ to
30 * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
31 * remote daemons serving up and coordinating consistent and safe
32 * access to storage.
33 *
34 * Cluster membership and the mapping of data objects onto storage devices
35 * are described by the osd map.
36 *
37 * We keep track of pending OSD requests (read, write), resubmit
38 * requests to different OSDs when the cluster topology/data layout
39 * change, or retry the affected requests when the communications
40 * channel with an OSD is reset.
41 */
42
43/*
44 * calculate the mapping of a file extent onto an object, and fill out the
45 * request accordingly. shorten extent as necessary if it crosses an
46 * object boundary.
47 *
48 * fill osd op in request message.
49 */
50static void calc_layout(struct ceph_osd_client *osdc,
51 struct ceph_vino vino, struct ceph_file_layout *layout,
52 u64 off, u64 *plen,
53 struct ceph_osd_request *req)
54{
55 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
56 struct ceph_osd_op *op = (void *)(reqhead + 1);
57 u64 orig_len = *plen;
58 u64 objoff, objlen; /* extent in object */
59 u64 bno;
60
61 reqhead->snapid = cpu_to_le64(vino.snap);
62
63 /* object extent? */
64 ceph_calc_file_object_mapping(layout, off, plen, &bno,
65 &objoff, &objlen);
66 if (*plen < orig_len)
67 dout(" skipping last %llu, final file extent %llu~%llu\n",
68 orig_len - *plen, off, *plen);
69
70 sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
71 req->r_oid_len = strlen(req->r_oid);
72
73 op->extent.offset = cpu_to_le64(objoff);
74 op->extent.length = cpu_to_le64(objlen);
75 req->r_num_pages = calc_pages_for(off, *plen);
76
77 dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
78 req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
79}
80
81/*
82 * requests
83 */
84void ceph_osdc_release_request(struct kref *kref)
85{
86 struct ceph_osd_request *req = container_of(kref,
87 struct ceph_osd_request,
88 r_kref);
89
90 if (req->r_request)
91 ceph_msg_put(req->r_request);
92 if (req->r_reply)
93 ceph_msg_put(req->r_reply);
94 if (req->r_con_filling_msg) {
95 dout("release_request revoking pages %p from con %p\n",
96 req->r_pages, req->r_con_filling_msg);
97 ceph_con_revoke_message(req->r_con_filling_msg,
98 req->r_reply);
99 ceph_con_put(req->r_con_filling_msg);
100 }
101 if (req->r_own_pages)
102 ceph_release_page_vector(req->r_pages,
103 req->r_num_pages);
104 ceph_put_snap_context(req->r_snapc);
105 if (req->r_mempool)
106 mempool_free(req, req->r_osdc->req_mempool);
107 else
108 kfree(req);
109}
110
111/*
112 * build new request AND message, calculate layout, and adjust file
113 * extent as needed.
114 *
115 * if the file was recently truncated, we include information about its
116 * old and new size so that the object can be updated appropriately. (we
117 * avoid synchronously deleting truncated objects because it's slow.)
118 *
119 * if @do_sync, include a 'startsync' command so that the osd will flush
120 * data quickly.
121 */
122struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 off, u64 *plen,
126 int opcode, int flags,
127 struct ceph_snap_context *snapc,
128 int do_sync,
129 u32 truncate_seq,
130 u64 truncate_size,
131 struct timespec *mtime,
132 bool use_mempool, int num_reply)
133{
134 struct ceph_osd_request *req;
135 struct ceph_msg *msg;
136 struct ceph_osd_request_head *head;
137 struct ceph_osd_op *op;
138 void *p;
139 int num_op = 1 + do_sync;
140 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
141 int i;
142
143 if (use_mempool) {
144 req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
145 memset(req, 0, sizeof(*req));
146 } else {
147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 }
149 if (req == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool;
154 kref_init(&req->r_kref);
155 init_completion(&req->r_completion);
156 init_completion(&req->r_safe_completion);
157 INIT_LIST_HEAD(&req->r_unsafe_item);
158 req->r_flags = flags;
159
160 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
161
162 /* create reply message */
163 if (use_mempool)
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, 0, 0, NULL);
168 if (IS_ERR(msg)) {
169 ceph_osdc_put_request(req);
170 return ERR_PTR(PTR_ERR(msg));
171 }
172 req->r_reply = msg;
173
174 /* create request message; allow space for oid */
175 msg_size += 40;
176 if (snapc)
177 msg_size += sizeof(u64) * snapc->num_snaps;
178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
182 if (IS_ERR(msg)) {
183 ceph_osdc_put_request(req);
184 return ERR_PTR(PTR_ERR(msg));
185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len);
188 head = msg->front.iov_base;
189 op = (void *)(head + 1);
190 p = (void *)(op + num_op);
191
192 req->r_request = msg;
193 req->r_snapc = ceph_get_snap_context(snapc);
194
195 head->client_inc = cpu_to_le32(1); /* always, for now. */
196 head->flags = cpu_to_le32(flags);
197 if (flags & CEPH_OSD_FLAG_WRITE)
198 ceph_encode_timespec(&head->mtime, mtime);
199 head->num_ops = cpu_to_le16(num_op);
200 op->op = cpu_to_le16(opcode);
201
202 /* calculate max write size */
203 calc_layout(osdc, vino, layout, off, plen, req);
204 req->r_file_layout = *layout; /* keep a copy */
205
206 if (flags & CEPH_OSD_FLAG_WRITE) {
207 req->r_request->hdr.data_off = cpu_to_le16(off);
208 req->r_request->hdr.data_len = cpu_to_le32(*plen);
209 op->payload_len = cpu_to_le32(*plen);
210 }
211 op->extent.truncate_size = cpu_to_le64(truncate_size);
212 op->extent.truncate_seq = cpu_to_le32(truncate_seq);
213
214 /* fill in oid */
215 head->object_len = cpu_to_le32(req->r_oid_len);
216 memcpy(p, req->r_oid, req->r_oid_len);
217 p += req->r_oid_len;
218
219 if (do_sync) {
220 op++;
221 op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
222 }
223 if (snapc) {
224 head->snap_seq = cpu_to_le64(snapc->seq);
225 head->num_snaps = cpu_to_le32(snapc->num_snaps);
226 for (i = 0; i < snapc->num_snaps; i++) {
227 put_unaligned_le64(snapc->snaps[i], p);
228 p += sizeof(u64);
229 }
230 }
231
232 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
233 msg_size = p - msg->front.iov_base;
234 msg->front.iov_len = msg_size;
235 msg->hdr.front_len = cpu_to_le32(msg_size);
236 return req;
237}
238
239/*
240 * We keep osd requests in an rbtree, sorted by ->r_tid.
241 */
242static void __insert_request(struct ceph_osd_client *osdc,
243 struct ceph_osd_request *new)
244{
245 struct rb_node **p = &osdc->requests.rb_node;
246 struct rb_node *parent = NULL;
247 struct ceph_osd_request *req = NULL;
248
249 while (*p) {
250 parent = *p;
251 req = rb_entry(parent, struct ceph_osd_request, r_node);
252 if (new->r_tid < req->r_tid)
253 p = &(*p)->rb_left;
254 else if (new->r_tid > req->r_tid)
255 p = &(*p)->rb_right;
256 else
257 BUG();
258 }
259
260 rb_link_node(&new->r_node, parent, p);
261 rb_insert_color(&new->r_node, &osdc->requests);
262}
263
264static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
265 u64 tid)
266{
267 struct ceph_osd_request *req;
268 struct rb_node *n = osdc->requests.rb_node;
269
270 while (n) {
271 req = rb_entry(n, struct ceph_osd_request, r_node);
272 if (tid < req->r_tid)
273 n = n->rb_left;
274 else if (tid > req->r_tid)
275 n = n->rb_right;
276 else
277 return req;
278 }
279 return NULL;
280}
281
282static struct ceph_osd_request *
283__lookup_request_ge(struct ceph_osd_client *osdc,
284 u64 tid)
285{
286 struct ceph_osd_request *req;
287 struct rb_node *n = osdc->requests.rb_node;
288
289 while (n) {
290 req = rb_entry(n, struct ceph_osd_request, r_node);
291 if (tid < req->r_tid) {
292 if (!n->rb_left)
293 return req;
294 n = n->rb_left;
295 } else if (tid > req->r_tid) {
296 n = n->rb_right;
297 } else {
298 return req;
299 }
300 }
301 return NULL;
302}
303
304
305/*
306 * If the osd connection drops, we need to resubmit all requests.
307 */
308static void osd_reset(struct ceph_connection *con)
309{
310 struct ceph_osd *osd = con->private;
311 struct ceph_osd_client *osdc;
312
313 if (!osd)
314 return;
315 dout("osd_reset osd%d\n", osd->o_osd);
316 osdc = osd->o_osdc;
317 down_read(&osdc->map_sem);
318 kick_requests(osdc, osd);
319 up_read(&osdc->map_sem);
320}
321
322/*
323 * Track open sessions with osds.
324 */
325static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
326{
327 struct ceph_osd *osd;
328
329 osd = kzalloc(sizeof(*osd), GFP_NOFS);
330 if (!osd)
331 return NULL;
332
333 atomic_set(&osd->o_ref, 1);
334 osd->o_osdc = osdc;
335 INIT_LIST_HEAD(&osd->o_requests);
336 INIT_LIST_HEAD(&osd->o_osd_lru);
337 osd->o_incarnation = 1;
338
339 ceph_con_init(osdc->client->msgr, &osd->o_con);
340 osd->o_con.private = osd;
341 osd->o_con.ops = &osd_con_ops;
342 osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
343
344 INIT_LIST_HEAD(&osd->o_keepalive_item);
345 return osd;
346}
347
348static struct ceph_osd *get_osd(struct ceph_osd *osd)
349{
350 if (atomic_inc_not_zero(&osd->o_ref)) {
351 dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
352 atomic_read(&osd->o_ref));
353 return osd;
354 } else {
355 dout("get_osd %p FAIL\n", osd);
356 return NULL;
357 }
358}
359
360static void put_osd(struct ceph_osd *osd)
361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref))
365 kfree(osd);
366}
367
368/*
369 * remove an osd from our map
370 */
371static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
372{
373 dout("__remove_osd %p\n", osd);
374 BUG_ON(!list_empty(&osd->o_requests));
375 rb_erase(&osd->o_node, &osdc->osds);
376 list_del_init(&osd->o_osd_lru);
377 ceph_con_close(&osd->o_con);
378 put_osd(osd);
379}
380
381static void __move_osd_to_lru(struct ceph_osd_client *osdc,
382 struct ceph_osd *osd)
383{
384 dout("__move_osd_to_lru %p\n", osd);
385 BUG_ON(!list_empty(&osd->o_osd_lru));
386 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
387 osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
388}
389
390static void __remove_osd_from_lru(struct ceph_osd *osd)
391{
392 dout("__remove_osd_from_lru %p\n", osd);
393 if (!list_empty(&osd->o_osd_lru))
394 list_del_init(&osd->o_osd_lru);
395}
396
397static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
398{
399 struct ceph_osd *osd, *nosd;
400
401 dout("__remove_old_osds %p\n", osdc);
402 mutex_lock(&osdc->request_mutex);
403 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
404 if (!remove_all && time_before(jiffies, osd->lru_ttl))
405 break;
406 __remove_osd(osdc, osd);
407 }
408 mutex_unlock(&osdc->request_mutex);
409}
410
411/*
412 * reset osd connect
413 */
414static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
415{
416 struct ceph_osd_request *req;
417 int ret = 0;
418
419 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
420 if (list_empty(&osd->o_requests)) {
421 __remove_osd(osdc, osd);
422 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
423 &osd->o_con.peer_addr,
424 sizeof(osd->o_con.peer_addr)) == 0 &&
425 !ceph_con_opened(&osd->o_con)) {
426 dout(" osd addr hasn't changed and connection never opened,"
427 " letting msgr retry");
428 /* touch each r_stamp for handle_timeout()'s benfit */
429 list_for_each_entry(req, &osd->o_requests, r_osd_item)
430 req->r_stamp = jiffies;
431 ret = -EAGAIN;
432 } else {
433 ceph_con_close(&osd->o_con);
434 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
435 osd->o_incarnation++;
436 }
437 return ret;
438}
439
440static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
441{
442 struct rb_node **p = &osdc->osds.rb_node;
443 struct rb_node *parent = NULL;
444 struct ceph_osd *osd = NULL;
445
446 while (*p) {
447 parent = *p;
448 osd = rb_entry(parent, struct ceph_osd, o_node);
449 if (new->o_osd < osd->o_osd)
450 p = &(*p)->rb_left;
451 else if (new->o_osd > osd->o_osd)
452 p = &(*p)->rb_right;
453 else
454 BUG();
455 }
456
457 rb_link_node(&new->o_node, parent, p);
458 rb_insert_color(&new->o_node, &osdc->osds);
459}
460
461static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
462{
463 struct ceph_osd *osd;
464 struct rb_node *n = osdc->osds.rb_node;
465
466 while (n) {
467 osd = rb_entry(n, struct ceph_osd, o_node);
468 if (o < osd->o_osd)
469 n = n->rb_left;
470 else if (o > osd->o_osd)
471 n = n->rb_right;
472 else
473 return osd;
474 }
475 return NULL;
476}
477
478static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
479{
480 schedule_delayed_work(&osdc->timeout_work,
481 osdc->client->mount_args->osd_keepalive_timeout * HZ);
482}
483
484static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
485{
486 cancel_delayed_work(&osdc->timeout_work);
487}
488
489/*
490 * Register request, assign tid. If this is the first request, set up
491 * the timeout event.
492 */
493static void register_request(struct ceph_osd_client *osdc,
494 struct ceph_osd_request *req)
495{
496 mutex_lock(&osdc->request_mutex);
497 req->r_tid = ++osdc->last_tid;
498 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
499 INIT_LIST_HEAD(&req->r_req_lru_item);
500
501 dout("register_request %p tid %lld\n", req, req->r_tid);
502 __insert_request(osdc, req);
503 ceph_osdc_get_request(req);
504 osdc->num_requests++;
505
506 if (osdc->num_requests == 1) {
507 dout(" first request, scheduling timeout\n");
508 __schedule_osd_timeout(osdc);
509 }
510 mutex_unlock(&osdc->request_mutex);
511}
512
513/*
514 * called under osdc->request_mutex
515 */
516static void __unregister_request(struct ceph_osd_client *osdc,
517 struct ceph_osd_request *req)
518{
519 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
520 rb_erase(&req->r_node, &osdc->requests);
521 osdc->num_requests--;
522
523 if (req->r_osd) {
524 /* make sure the original request isn't in flight. */
525 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
526
527 list_del_init(&req->r_osd_item);
528 if (list_empty(&req->r_osd->o_requests))
529 __move_osd_to_lru(osdc, req->r_osd);
530 req->r_osd = NULL;
531 }
532
533 ceph_osdc_put_request(req);
534
535 list_del_init(&req->r_req_lru_item);
536 if (osdc->num_requests == 0) {
537 dout(" no requests, canceling timeout\n");
538 __cancel_osd_timeout(osdc);
539 }
540}
541
542/*
543 * Cancel a previously queued request message
544 */
545static void __cancel_request(struct ceph_osd_request *req)
546{
547 if (req->r_sent) {
548 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
549 req->r_sent = 0;
550 }
551 list_del_init(&req->r_req_lru_item);
552}
553
554/*
555 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
556 * (as needed), and set the request r_osd appropriately. If there is
557 * no up osd, set r_osd to NULL.
558 *
559 * Return 0 if unchanged, 1 if changed, or negative on error.
560 *
561 * Caller should hold map_sem for read and request_mutex.
562 */
563static int __map_osds(struct ceph_osd_client *osdc,
564 struct ceph_osd_request *req)
565{
566 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
567 struct ceph_pg pgid;
568 int o = -1;
569 int err;
570
571 dout("map_osds %p tid %lld\n", req, req->r_tid);
572 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
573 &req->r_file_layout, osdc->osdmap);
574 if (err)
575 return err;
576 pgid = reqhead->layout.ol_pgid;
577 req->r_pgid = pgid;
578
579 o = ceph_calc_pg_primary(osdc->osdmap, pgid);
580
581 if ((req->r_osd && req->r_osd->o_osd == o &&
582 req->r_sent >= req->r_osd->o_incarnation) ||
583 (req->r_osd == NULL && o == -1))
584 return 0; /* no change */
585
586 dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
587 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
588 req->r_osd ? req->r_osd->o_osd : -1);
589
590 if (req->r_osd) {
591 __cancel_request(req);
592 list_del_init(&req->r_osd_item);
593 req->r_osd = NULL;
594 }
595
596 req->r_osd = __lookup_osd(osdc, o);
597 if (!req->r_osd && o >= 0) {
598 err = -ENOMEM;
599 req->r_osd = create_osd(osdc);
600 if (!req->r_osd)
601 goto out;
602
603 dout("map_osds osd %p is osd%d\n", req->r_osd, o);
604 req->r_osd->o_osd = o;
605 req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
606 __insert_osd(osdc, req->r_osd);
607
608 ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
609 }
610
611 if (req->r_osd) {
612 __remove_osd_from_lru(req->r_osd);
613 list_add(&req->r_osd_item, &req->r_osd->o_requests);
614 }
615 err = 1; /* osd changed */
616
617out:
618 return err;
619}
620
621/*
622 * caller should hold map_sem (for read) and request_mutex
623 */
624static int __send_request(struct ceph_osd_client *osdc,
625 struct ceph_osd_request *req)
626{
627 struct ceph_osd_request_head *reqhead;
628 int err;
629
630 err = __map_osds(osdc, req);
631 if (err < 0)
632 return err;
633 if (req->r_osd == NULL) {
634 dout("send_request %p no up osds in pg\n", req);
635 ceph_monc_request_next_osdmap(&osdc->client->monc);
636 return 0;
637 }
638
639 dout("send_request %p tid %llu to osd%d flags %d\n",
640 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
641
642 reqhead = req->r_request->front.iov_base;
643 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
644 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
645 reqhead->reassert_version = req->r_reassert_version;
646
647 req->r_stamp = jiffies;
648 list_move_tail(&osdc->req_lru, &req->r_req_lru_item);
649
650 ceph_msg_get(req->r_request); /* send consumes a ref */
651 ceph_con_send(&req->r_osd->o_con, req->r_request);
652 req->r_sent = req->r_osd->o_incarnation;
653 return 0;
654}
655
656/*
657 * Timeout callback, called every N seconds when 1 or more osd
658 * requests has been active for more than N seconds. When this
659 * happens, we ping all OSDs with requests who have timed out to
660 * ensure any communications channel reset is detected. Reset the
661 * request timeouts another N seconds in the future as we go.
662 * Reschedule the timeout event another N seconds in future (unless
663 * there are no open requests).
664 */
665static void handle_timeout(struct work_struct *work)
666{
667 struct ceph_osd_client *osdc =
668 container_of(work, struct ceph_osd_client, timeout_work.work);
669 struct ceph_osd_request *req, *last_req = NULL;
670 struct ceph_osd *osd;
	/* both mount options are scaled by HZ here, i.e. converted to jiffies */
671 unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
672 unsigned long keepalive =
673 osdc->client->mount_args->osd_keepalive_timeout * HZ;
674 unsigned long last_stamp = 0;
675 struct rb_node *p;
676 struct list_head slow_osds;
677
678 dout("timeout\n");
	/* lock order: map_sem (read) first, request_mutex further below */
679 down_read(&osdc->map_sem);
680
681 ceph_monc_request_next_osdmap(&osdc->client->monc);
682
683 mutex_lock(&osdc->request_mutex);
	/*
	 * pass 1: retry every request flagged r_resend.  The flag is only
	 * cleared when __send_request() returns 0, so a failed retry is
	 * attempted again on the next run.
	 */
684 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
685 req = rb_entry(p, struct ceph_osd_request, r_node);
686
687 if (req->r_resend) {
688 int err;
689
690 dout("osdc resending prev failed %lld\n", req->r_tid);
691 err = __send_request(osdc, req);
692 if (err)
693 dout("osdc failed again on %lld\n", req->r_tid);
694 else
695 req->r_resend = false;
696 continue;
697 }
698 }
699
700 /*
701 * reset osds that appear to be _really_ unresponsive. this
702 * is a failsafe measure.. we really shouldn't be getting to
703 * this point if the system is working properly. the monitors
704 * should mark the osd as failed and we should find out about
705 * it from an updated osd map.
706 */
	/* pass 2: always inspects the head of req_lru; stops at the first
	 * request that has not yet been outstanding for 'timeout' jiffies */
707 while (!list_empty(&osdc->req_lru)) {
708 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
709 r_req_lru_item);
710
711 if (time_before(jiffies, req->r_stamp + timeout))
712 break;
713
	/* same head request with an unchanged stamp means the previous
	 * kick made no progress and this loop would never terminate */
714 BUG_ON(req == last_req && req->r_stamp == last_stamp);
715 last_req = req;
716 last_stamp = req->r_stamp;
717
718 osd = req->r_osd;
719 BUG_ON(!osd);
720 pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
721 req->r_tid, osd->o_osd);
722 __kick_requests(osdc, osd);
723 }
724
725 /*
726 * ping osds that are a bit slow. this ensures that if there
727 * is a break in the TCP connection we will notice, and reopen
728 * a connection with that osd (from the fault callback).
729 */
730 INIT_LIST_HEAD(&slow_osds);
731 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
732 if (time_before(jiffies, req->r_stamp + keepalive))
733 break;
734
735 osd = req->r_osd;
736 BUG_ON(!osd);
737 dout(" tid %llu is slow, will send keepalive on osd%d\n",
738 req->r_tid, osd->o_osd);
	/* list_move_tail unlinks first, so an osd with several slow
	 * requests ends up on slow_osds exactly once */
739 list_move_tail(&osd->o_keepalive_item, &slow_osds);
740 }
	/* drain the local list, sending one keepalive per collected osd */
741 while (!list_empty(&slow_osds)) {
742 osd = list_entry(slow_osds.next, struct ceph_osd,
743 o_keepalive_item);
744 list_del_init(&osd->o_keepalive_item);
745 ceph_con_keepalive(&osd->o_con);
746 }
747
	/* re-armed unconditionally here; __unregister_request() cancels
	 * the work once num_requests drops to 0 */
748 __schedule_osd_timeout(osdc);
749 mutex_unlock(&osdc->request_mutex);
750
751 up_read(&osdc->map_sem);
752}
753
754static void handle_osds_timeout(struct work_struct *work)
755{
756 struct ceph_osd_client *osdc =
757 container_of(work, struct ceph_osd_client,
758 osds_timeout_work.work);
759 unsigned long delay =
760 osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
761
762 dout("osds timeout\n");
763 down_read(&osdc->map_sem);
764 remove_old_osds(osdc, 0);
765 up_read(&osdc->map_sem);
766
767 schedule_delayed_work(&osdc->osds_timeout_work,
768 round_jiffies_relative(delay));
769}
770
771/*
772 * handle osd op reply. either call the callback if it is specified,
773 * or do the completion to wake up the waiting thread.
774 */
775static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
776 struct ceph_connection *con)
777{
778 struct ceph_osd_reply_head *rhead = msg->front.iov_base;
779 struct ceph_osd_request *req;
780 u64 tid;
781 int numops, object_len, flags;
782
783 tid = le64_to_cpu(msg->hdr.tid);
	/* validate the front section before trusting any field in rhead:
	 * first that the fixed header fits, then that the total length
	 * matches header + object name + op array exactly */
784 if (msg->front.iov_len < sizeof(*rhead))
785 goto bad;
786 numops = le32_to_cpu(rhead->num_ops);
787 object_len = le32_to_cpu(rhead->object_len);
788 if (msg->front.iov_len != sizeof(*rhead) + object_len +
789 numops * sizeof(struct ceph_osd_op))
790 goto bad;
791 dout("handle_reply %p tid %llu\n", msg, tid);
792
793 /* lookup */
794 mutex_lock(&osdc->request_mutex);
795 req = __lookup_request(osdc, tid);
796 if (req == NULL) {
797 dout("handle_reply tid %llu dne\n", tid);
798 mutex_unlock(&osdc->request_mutex);
799 return;
800 }
	/* hold our own reference: the request may be unregistered below,
	 * but is still used after request_mutex is dropped (put at 'done') */
801 ceph_osdc_get_request(req);
802 flags = le32_to_cpu(rhead->flags);
803
804 /*
805 * if this connection filled our message, drop our reference now, to
806 * avoid a (safe but slower) revoke later.
807 */
808 if (req->r_con_filling_msg == con && req->r_reply == msg) {
809 dout(" dropping con_filling_msg ref %p\n", con);
810 req->r_con_filling_msg = NULL;
811 ceph_con_put(con);
812 }
813
	/* first reply for this tid: record result and reassert version */
814 if (!req->r_got_reply) {
815 unsigned bytes;
816
817 req->r_result = le32_to_cpu(rhead->result);
818 bytes = le32_to_cpu(msg->hdr.data_len);
819 dout("handle_reply result %d bytes %d\n", req->r_result,
820 bytes);
	/* a zero result is replaced by the data length of the reply */
821 if (req->r_result == 0)
822 req->r_result = bytes;
823
824 /* in case this is a write and we need to replay, */
825 req->r_reassert_version = rhead->reassert_version;
826
827 req->r_got_reply = 1;
828 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
	/* already had a reply and this one is not ONDISK: ignore it,
	 * no callbacks or completions are run */
829 dout("handle_reply tid %llu dup ack\n", tid);
830 mutex_unlock(&osdc->request_mutex);
831 goto done;
832 }
833
834 dout("handle_reply tid %llu flags %d\n", tid, flags);
835
836 /* either this is a read, or we got the safe response */
837 if ((flags & CEPH_OSD_FLAG_ONDISK) ||
838 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
839 __unregister_request(osdc, req);
840
841 mutex_unlock(&osdc->request_mutex);
842
	/* callbacks and completions run without request_mutex held */
843 if (req->r_callback)
844 req->r_callback(req, msg);
845 else
846 complete(&req->r_completion);
847
848 if (flags & CEPH_OSD_FLAG_ONDISK) {
849 if (req->r_safe_callback)
850 req->r_safe_callback(req, msg);
851 complete(&req->r_safe_completion); /* fsync waiter */
852 }
853
854done:
855 ceph_osdc_put_request(req);
856 return;
857
	/* reached only from the length checks above, i.e. before
	 * request_mutex is taken and before any reference is held */
858bad:
859 pr_err("corrupt osd_op_reply got %d %d expected %d\n",
860 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
861 (int)sizeof(*rhead));
862 ceph_msg_dump(msg);
863}
864
865
/*
 * Reset osd connections and resend the requests affected by that.
 *
 * With @kickosd set, only that osd is reset and its requests are
 * resent; if __reset_osd() returns -EAGAIN nothing else is done and 1
 * is returned.  With @kickosd NULL, every known osd that is not up in
 * the current osdmap, or whose address differs from the map, is reset.
 *
 * Then all registered requests are walked: those flagged r_resend or
 * attached to @kickosd are cancelled and resent; the others are
 * remapped and resent only if their mapping changed.
 *
 * Returns the number of requests that currently map to no osd (the
 * caller uses a non-zero value to ask for a newer map), or 1 in the
 * -EAGAIN case described above.
 *
 * NOTE(review): callers visible here hold request_mutex around this
 * call; map_sem is presumably also required since osdc->osdmap is
 * read -- confirm.
 */
866static int __kick_requests(struct ceph_osd_client *osdc,
867 struct ceph_osd *kickosd)
868{
869 struct ceph_osd_request *req;
870 struct rb_node *p, *n;
871 int needmap = 0;
872 int err;
873
874 dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
875 if (kickosd) {
876 err = __reset_osd(osdc, kickosd);
877 if (err == -EAGAIN)
878 return 1;
879 } else {
	/* fetch the successor before __reset_osd(), which may remove
	 * the current osd from the tree */
880 for (p = rb_first(&osdc->osds); p; p = n) {
881 struct ceph_osd *osd =
882 rb_entry(p, struct ceph_osd, o_node);
883
884 n = rb_next(p);
885 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
886 memcmp(&osd->o_con.peer_addr,
887 ceph_osd_addr(osdc->osdmap,
888 osd->o_osd),
889 sizeof(struct ceph_entity_addr)) != 0)
890 __reset_osd(osdc, osd);
891 }
892 }
893
894 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
895 req = rb_entry(p, struct ceph_osd_request, r_node);
896
	/* the next two cases skip the "did the mapping change" test and
	 * jump straight to the resend at 'kick' */
897 if (req->r_resend) {
898 dout(" r_resend set on tid %llu\n", req->r_tid);
899 __cancel_request(req);
900 goto kick;
901 }
902 if (req->r_osd && kickosd == req->r_osd) {
903 __cancel_request(req);
904 goto kick;
905 }
906
907 err = __map_osds(osdc, req);
908 if (err == 0)
909 continue; /* no change */
910 if (err < 0) {
911 /*
912 * FIXME: really, we should set the request
913 * error and fail if this isn't a 'nofail'
914 * request, but that's a fair bit more
915 * complicated to do. So retry!
916 */
917 dout(" setting r_resend on %llu\n", req->r_tid);
918 req->r_resend = true;
919 continue;
920 }
921 if (req->r_osd == NULL) {
922 dout("tid %llu maps to no valid osd\n", req->r_tid);
923 needmap++; /* request a newer map */
924 continue;
925 }
926
927kick:
928 dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
929 req->r_osd ? req->r_osd->o_osd : -1);
930 req->r_flags |= CEPH_OSD_FLAG_RETRY;
931 err = __send_request(osdc, req);
	/* a failed send is retried later via the r_resend flag */
932 if (err) {
933 dout(" setting r_resend on %llu\n", req->r_tid);
934 req->r_resend = true;
935 }
936 }
937
938 return needmap;
939}
940
941/*
942 * Resubmit osd requests whose osd or osd address has changed. Request
943 * a new osd map if osds are down, or we are otherwise unable to determine
944 * how to direct a request.
945 *
946 * Close connections to down osds.
947 *
948 * If @who is specified, resubmit requests for that specific osd.
949 *
950 * Caller should hold map_sem for read and request_mutex.
951 */
952static void kick_requests(struct ceph_osd_client *osdc,
953 struct ceph_osd *kickosd)
954{
955 int needmap;
956
957 mutex_lock(&osdc->request_mutex);
958 needmap = __kick_requests(osdc, kickosd);
959 mutex_unlock(&osdc->request_mutex);
960
961 if (needmap) {
962 dout("%d requests for down osds, need new map\n", needmap);
963 ceph_monc_request_next_osdmap(&osdc->client->monc);
964 }
965
966}
967/*
968 * Process updated osd map.
969 *
970 * The message contains any number of incremental and full maps, normally
971 * indicating some sort of topology change in the cluster. Kick requests
972 * off to different OSDs as needed.
973 */
974void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
975{
976 void *p, *end, *next;
977 u32 nr_maps, maplen;
978 u32 epoch;
979 struct ceph_osdmap *newmap = NULL, *oldmap;
980 int err;
981 struct ceph_fsid fsid;
982
983 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
984 p = msg->front.iov_base;
985 end = p + msg->front.iov_len;
986
987 /* verify fsid */
988 ceph_decode_need(&p, end, sizeof(fsid), bad);
989 ceph_decode_copy(&p, &fsid, sizeof(fsid));
990 if (ceph_check_fsid(osdc->client, &fsid) < 0)
991 return;
992
993 down_write(&osdc->map_sem);
994
995 /* incremental maps */
996 ceph_decode_32_safe(&p, end, nr_maps, bad);
997 dout(" %d inc maps\n", nr_maps);
998 while (nr_maps > 0) {
999 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1000 epoch = ceph_decode_32(&p);
1001 maplen = ceph_decode_32(&p);
1002 ceph_decode_need(&p, end, maplen, bad);
1003 next = p + maplen;
1004 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
1005 dout("applying incremental map %u len %d\n",
1006 epoch, maplen);
1007 newmap = osdmap_apply_incremental(&p, next,
1008 osdc->osdmap,
1009 osdc->client->msgr);
1010 if (IS_ERR(newmap)) {
1011 err = PTR_ERR(newmap);
1012 goto bad;
1013 }
1014 BUG_ON(!newmap);
1015 if (newmap != osdc->osdmap) {
1016 ceph_osdmap_destroy(osdc->osdmap);
1017 osdc->osdmap = newmap;
1018 }
1019 } else {
1020 dout("ignoring incremental map %u len %d\n",
1021 epoch, maplen);
1022 }
1023 p = next;
1024 nr_maps--;
1025 }
1026 if (newmap)
1027 goto done;
1028
1029 /* full maps */
1030 ceph_decode_32_safe(&p, end, nr_maps, bad);
1031 dout(" %d full maps\n", nr_maps);
1032 while (nr_maps) {
1033 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1034 epoch = ceph_decode_32(&p);
1035 maplen = ceph_decode_32(&p);
1036 ceph_decode_need(&p, end, maplen, bad);
1037 if (nr_maps > 1) {
1038 dout("skipping non-latest full map %u len %d\n",
1039 epoch, maplen);
1040 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
1041 dout("skipping full map %u len %d, "
1042 "older than our %u\n", epoch, maplen,
1043 osdc->osdmap->epoch);
1044 } else {
1045 dout("taking full map %u len %d\n", epoch, maplen);
1046 newmap = osdmap_decode(&p, p+maplen);
1047 if (IS_ERR(newmap)) {
1048 err = PTR_ERR(newmap);
1049 goto bad;
1050 }
1051 BUG_ON(!newmap);
1052 oldmap = osdc->osdmap;
1053 osdc->osdmap = newmap;
1054 if (oldmap)
1055 ceph_osdmap_destroy(oldmap);
1056 }
1057 p += maplen;
1058 nr_maps--;
1059 }
1060
1061done:
1062 downgrade_write(&osdc->map_sem);
1063 ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
1064 if (newmap)
1065 kick_requests(osdc, NULL);
1066 up_read(&osdc->map_sem);
1067 return;
1068
1069bad:
1070 pr_err("osdc handle_map corrupt msg\n");
1071 ceph_msg_dump(msg);
1072 up_write(&osdc->map_sem);
1073 return;
1074}
1075
1076
1077/*
1078 * A read request prepares specific pages that data is to be read into.
1079 * When a message is being read off the wire, we call prepare_pages to
1080 * find those pages.
1081 * 0 = success, -1 failure.
1082 */
1083static int __prepare_pages(struct ceph_connection *con,
1084 struct ceph_msg_header *hdr,
1085 struct ceph_osd_request *req,
1086 u64 tid,
1087 struct ceph_msg *m)
1088{
1089 struct ceph_osd *osd = con->private;
1090 struct ceph_osd_client *osdc;
1091 int ret = -1;
1092 int data_len = le32_to_cpu(hdr->data_len);
1093 unsigned data_off = le16_to_cpu(hdr->data_off);
1094
1095 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1096
1097 if (!osd)
1098 return -1;
1099
1100 osdc = osd->o_osdc;
1101
1102 dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
1103 tid, req->r_num_pages, want);
1104 if (unlikely(req->r_num_pages < want))
1105 goto out;
1106 m->pages = req->r_pages;
1107 m->nr_pages = req->r_num_pages;
1108 ret = 0; /* success */
1109out:
1110 BUG_ON(ret < 0 || m->nr_pages < want);
1111
1112 return ret;
1113}
1114
1115/*
1116 * Register request, send initial attempt.
1117 */
1118int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1119 struct ceph_osd_request *req,
1120 bool nofail)
1121{
1122 int rc = 0;
1123
1124 req->r_request->pages = req->r_pages;
1125 req->r_request->nr_pages = req->r_num_pages;
1126
1127 register_request(osdc, req);
1128
1129 down_read(&osdc->map_sem);
1130 mutex_lock(&osdc->request_mutex);
1131 /*
1132 * a racing kick_requests() may have sent the message for us
1133 * while we dropped request_mutex above, so only send now if
1134 * the request still han't been touched yet.
1135 */
1136 if (req->r_sent == 0) {
1137 rc = __send_request(osdc, req);
1138 if (rc) {
1139 if (nofail) {
1140 dout("osdc_start_request failed send, "
1141 " marking %lld\n", req->r_tid);
1142 req->r_resend = true;
1143 rc = 0;
1144 } else {
1145 __unregister_request(osdc, req);
1146 }
1147 }
1148 }
1149 mutex_unlock(&osdc->request_mutex);
1150 up_read(&osdc->map_sem);
1151 return rc;
1152}
1153
1154/*
1155 * wait for a request to complete
1156 */
1157int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
1158 struct ceph_osd_request *req)
1159{
1160 int rc;
1161
1162 rc = wait_for_completion_interruptible(&req->r_completion);
1163 if (rc < 0) {
1164 mutex_lock(&osdc->request_mutex);
1165 __cancel_request(req);
1166 __unregister_request(osdc, req);
1167 mutex_unlock(&osdc->request_mutex);
1168 dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
1169 return rc;
1170 }
1171
1172 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
1173 return req->r_result;
1174}
1175
1176/*
1177 * sync - wait for all in-flight requests to flush. avoid starvation.
1178 */
1179void ceph_osdc_sync(struct ceph_osd_client *osdc)
1180{
1181 struct ceph_osd_request *req;
1182 u64 last_tid, next_tid = 0;
1183
1184 mutex_lock(&osdc->request_mutex);
1185 last_tid = osdc->last_tid;
1186 while (1) {
1187 req = __lookup_request_ge(osdc, next_tid);
1188 if (!req)
1189 break;
1190 if (req->r_tid > last_tid)
1191 break;
1192
1193 next_tid = req->r_tid + 1;
1194 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
1195 continue;
1196
1197 ceph_osdc_get_request(req);
1198 mutex_unlock(&osdc->request_mutex);
1199 dout("sync waiting on tid %llu (last is %llu)\n",
1200 req->r_tid, last_tid);
1201 wait_for_completion(&req->r_safe_completion);
1202 mutex_lock(&osdc->request_mutex);
1203 ceph_osdc_put_request(req);
1204 }
1205 mutex_unlock(&osdc->request_mutex);
1206 dout("sync done (thru tid %llu)\n", last_tid);
1207}
1208
1209/*
1210 * init, shutdown
1211 */
1212int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1213{
1214 int err;
1215
1216 dout("init\n");
1217 osdc->client = client;
1218 osdc->osdmap = NULL;
1219 init_rwsem(&osdc->map_sem);
1220 init_completion(&osdc->map_waiters);
1221 osdc->last_requested_map = 0;
1222 mutex_init(&osdc->request_mutex);
1223 osdc->last_tid = 0;
1224 osdc->osds = RB_ROOT;
1225 INIT_LIST_HEAD(&osdc->osd_lru);
1226 osdc->requests = RB_ROOT;
1227 INIT_LIST_HEAD(&osdc->req_lru);
1228 osdc->num_requests = 0;
1229 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
1230 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
1231
1232 schedule_delayed_work(&osdc->osds_timeout_work,
1233 round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
1234
1235 err = -ENOMEM;
1236 osdc->req_mempool = mempool_create_kmalloc_pool(10,
1237 sizeof(struct ceph_osd_request));
1238 if (!osdc->req_mempool)
1239 goto out;
1240
1241 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true);
1242 if (err < 0)
1243 goto out_mempool;
1244 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1245 OSD_OPREPLY_FRONT_LEN, 10, true);
1246 if (err < 0)
1247 goto out_msgpool;
1248 return 0;
1249
1250out_msgpool:
1251 ceph_msgpool_destroy(&osdc->msgpool_op);
1252out_mempool:
1253 mempool_destroy(osdc->req_mempool);
1254out:
1255 return err;
1256}
1257
1258void ceph_osdc_stop(struct ceph_osd_client *osdc)
1259{
1260 cancel_delayed_work_sync(&osdc->timeout_work);
1261 cancel_delayed_work_sync(&osdc->osds_timeout_work);
1262 if (osdc->osdmap) {
1263 ceph_osdmap_destroy(osdc->osdmap);
1264 osdc->osdmap = NULL;
1265 }
1266 remove_old_osds(osdc, 1);
1267 mempool_destroy(osdc->req_mempool);
1268 ceph_msgpool_destroy(&osdc->msgpool_op);
1269 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1270}
1271
1272/*
1273 * Read some contiguous pages. If we cross a stripe boundary, shorten
1274 * *plen. Return number of bytes read, or error.
1275 */
1276int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1277 struct ceph_vino vino, struct ceph_file_layout *layout,
1278 u64 off, u64 *plen,
1279 u32 truncate_seq, u64 truncate_size,
1280 struct page **pages, int num_pages)
1281{
1282 struct ceph_osd_request *req;
1283 int rc = 0;
1284
1285 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
1286 vino.snap, off, *plen);
1287 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1288 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1289 NULL, 0, truncate_seq, truncate_size, NULL,
1290 false, 1);
1291 if (IS_ERR(req))
1292 return PTR_ERR(req);
1293
1294 /* it may be a short read due to an object boundary */
1295 req->r_pages = pages;
1296 num_pages = calc_pages_for(off, *plen);
1297 req->r_num_pages = num_pages;
1298
1299 dout("readpages final extent is %llu~%llu (%d pages)\n",
1300 off, *plen, req->r_num_pages);
1301
1302 rc = ceph_osdc_start_request(osdc, req, false);
1303 if (!rc)
1304 rc = ceph_osdc_wait_request(osdc, req);
1305
1306 ceph_osdc_put_request(req);
1307 dout("readpages result %d\n", rc);
1308 return rc;
1309}
1310
1311/*
1312 * do a synchronous write on N pages
1313 */
1314int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1315 struct ceph_file_layout *layout,
1316 struct ceph_snap_context *snapc,
1317 u64 off, u64 len,
1318 u32 truncate_seq, u64 truncate_size,
1319 struct timespec *mtime,
1320 struct page **pages, int num_pages,
1321 int flags, int do_sync, bool nofail)
1322{
1323 struct ceph_osd_request *req;
1324 int rc = 0;
1325
1326 BUG_ON(vino.snap != CEPH_NOSNAP);
1327 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
1328 CEPH_OSD_OP_WRITE,
1329 flags | CEPH_OSD_FLAG_ONDISK |
1330 CEPH_OSD_FLAG_WRITE,
1331 snapc, do_sync,
1332 truncate_seq, truncate_size, mtime,
1333 nofail, 1);
1334 if (IS_ERR(req))
1335 return PTR_ERR(req);
1336
1337 /* it may be a short write due to an object boundary */
1338 req->r_pages = pages;
1339 req->r_num_pages = calc_pages_for(off, len);
1340 dout("writepages %llu~%llu (%d pages)\n", off, len,
1341 req->r_num_pages);
1342
1343 rc = ceph_osdc_start_request(osdc, req, nofail);
1344 if (!rc)
1345 rc = ceph_osdc_wait_request(osdc, req);
1346
1347 ceph_osdc_put_request(req);
1348 if (rc == 0)
1349 rc = len;
1350 dout("writepages result %d\n", rc);
1351 return rc;
1352}
1353
1354/*
1355 * handle incoming message
1356 */
1357static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1358{
1359 struct ceph_osd *osd = con->private;
1360 struct ceph_osd_client *osdc;
1361 int type = le16_to_cpu(msg->hdr.type);
1362
1363 if (!osd)
1364 return;
1365 osdc = osd->o_osdc;
1366
1367 switch (type) {
1368 case CEPH_MSG_OSD_MAP:
1369 ceph_osdc_handle_map(osdc, msg);
1370 break;
1371 case CEPH_MSG_OSD_OPREPLY:
1372 handle_reply(osdc, msg, con);
1373 break;
1374
1375 default:
1376 pr_err("received unknown message type %d %s\n", type,
1377 ceph_msg_type_name(type));
1378 }
1379 ceph_msg_put(msg);
1380}
1381
1382/*
1383 * lookup and return message for incoming reply
1384 */
1385static struct ceph_msg *get_reply(struct ceph_connection *con,
1386 struct ceph_msg_header *hdr,
1387 int *skip)
1388{
1389 struct ceph_osd *osd = con->private;
1390 struct ceph_osd_client *osdc = osd->o_osdc;
1391 struct ceph_msg *m;
1392 struct ceph_osd_request *req;
1393 int front = le32_to_cpu(hdr->front_len);
1394 int data_len = le32_to_cpu(hdr->data_len);
1395 u64 tid;
1396 int err;
1397
1398 tid = le64_to_cpu(hdr->tid);
1399 mutex_lock(&osdc->request_mutex);
1400 req = __lookup_request(osdc, tid);
1401 if (!req) {
1402 *skip = 1;
1403 m = NULL;
1404 pr_info("get_reply unknown tid %llu from osd%d\n", tid,
1405 osd->o_osd);
1406 goto out;
1407 }
1408
1409 if (req->r_con_filling_msg) {
1410 dout("get_reply revoking msg %p from old con %p\n",
1411 req->r_reply, req->r_con_filling_msg);
1412 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1413 ceph_con_put(req->r_con_filling_msg);
1414 }
1415
1416 if (front > req->r_reply->front.iov_len) {
1417 pr_warning("get_reply front %d > preallocated %d\n",
1418 front, (int)req->r_reply->front.iov_len);
1419 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL);
1420 if (IS_ERR(m))
1421 goto out;
1422 ceph_msg_put(req->r_reply);
1423 req->r_reply = m;
1424 }
1425 m = ceph_msg_get(req->r_reply);
1426
1427 if (data_len > 0) {
1428 err = __prepare_pages(con, hdr, req, tid, m);
1429 if (err < 0) {
1430 *skip = 1;
1431 ceph_msg_put(m);
1432 m = ERR_PTR(err);
1433 }
1434 }
1435 *skip = 0;
1436 req->r_con_filling_msg = ceph_con_get(con);
1437 dout("get_reply tid %lld %p\n", tid, m);
1438
1439out:
1440 mutex_unlock(&osdc->request_mutex);
1441 return m;
1442
1443}
1444
1445static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1446 struct ceph_msg_header *hdr,
1447 int *skip)
1448{
1449 struct ceph_osd *osd = con->private;
1450 int type = le16_to_cpu(hdr->type);
1451 int front = le32_to_cpu(hdr->front_len);
1452
1453 switch (type) {
1454 case CEPH_MSG_OSD_MAP:
1455 return ceph_msg_new(type, front, 0, 0, NULL);
1456 case CEPH_MSG_OSD_OPREPLY:
1457 return get_reply(con, hdr, skip);
1458 default:
1459 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
1460 osd->o_osd);
1461 *skip = 1;
1462 return NULL;
1463 }
1464}
1465
1466/*
1467 * Wrappers to refcount containing ceph_osd struct
1468 */
1469static struct ceph_connection *get_osd_con(struct ceph_connection *con)
1470{
1471 struct ceph_osd *osd = con->private;
1472 if (get_osd(osd))
1473 return con;
1474 return NULL;
1475}
1476
1477static void put_osd_con(struct ceph_connection *con)
1478{
1479 struct ceph_osd *osd = con->private;
1480 put_osd(osd);
1481}
1482
1483/*
1484 * authentication
1485 */
1486static int get_authorizer(struct ceph_connection *con,
1487 void **buf, int *len, int *proto,
1488 void **reply_buf, int *reply_len, int force_new)
1489{
1490 struct ceph_osd *o = con->private;
1491 struct ceph_osd_client *osdc = o->o_osdc;
1492 struct ceph_auth_client *ac = osdc->client->monc.auth;
1493 int ret = 0;
1494
1495 if (force_new && o->o_authorizer) {
1496 ac->ops->destroy_authorizer(ac, o->o_authorizer);
1497 o->o_authorizer = NULL;
1498 }
1499 if (o->o_authorizer == NULL) {
1500 ret = ac->ops->create_authorizer(
1501 ac, CEPH_ENTITY_TYPE_OSD,
1502 &o->o_authorizer,
1503 &o->o_authorizer_buf,
1504 &o->o_authorizer_buf_len,
1505 &o->o_authorizer_reply_buf,
1506 &o->o_authorizer_reply_buf_len);
1507 if (ret)
1508 return ret;
1509 }
1510
1511 *proto = ac->protocol;
1512 *buf = o->o_authorizer_buf;
1513 *len = o->o_authorizer_buf_len;
1514 *reply_buf = o->o_authorizer_reply_buf;
1515 *reply_len = o->o_authorizer_reply_buf_len;
1516 return 0;
1517}
1518
1519
1520static int verify_authorizer_reply(struct ceph_connection *con, int len)
1521{
1522 struct ceph_osd *o = con->private;
1523 struct ceph_osd_client *osdc = o->o_osdc;
1524 struct ceph_auth_client *ac = osdc->client->monc.auth;
1525
1526 return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
1527}
1528
1529static int invalidate_authorizer(struct ceph_connection *con)
1530{
1531 struct ceph_osd *o = con->private;
1532 struct ceph_osd_client *osdc = o->o_osdc;
1533 struct ceph_auth_client *ac = osdc->client->monc.auth;
1534
1535 if (ac->ops->invalidate_authorizer)
1536 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
1537
1538 return ceph_monc_validate_auth(&osdc->client->monc);
1539}
1540
1541const static struct ceph_connection_operations osd_con_ops = {
1542 .get = get_osd_con,
1543 .put = put_osd_con,
1544 .dispatch = dispatch,
1545 .get_authorizer = get_authorizer,
1546 .verify_authorizer_reply = verify_authorizer_reply,
1547 .invalidate_authorizer = invalidate_authorizer,
1548 .alloc_msg = alloc_msg,
1549 .fault = osd_reset,
1550};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
new file mode 100644
index 000000000000..b0759911e7c3
--- /dev/null
+++ b/fs/ceph/osd_client.h
@@ -0,0 +1,166 @@
1#ifndef _FS_CEPH_OSD_CLIENT_H
2#define _FS_CEPH_OSD_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/mempool.h>
7#include <linux/rbtree.h>
8
9#include "types.h"
10#include "osdmap.h"
11#include "messenger.h"
12
13struct ceph_msg;
14struct ceph_snap_context;
15struct ceph_osd_request;
16struct ceph_osd_client;
17struct ceph_authorizer;
18
19/*
20 * completion callback for async writepages
21 */
22typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
23 struct ceph_msg *);
24
25/* a given osd we're communicating with */
26struct ceph_osd {
27 atomic_t o_ref;
28 struct ceph_osd_client *o_osdc;
29 int o_osd;
30 int o_incarnation;
31 struct rb_node o_node;
32 struct ceph_connection o_con;
33 struct list_head o_requests;
34 struct list_head o_osd_lru;
35 struct ceph_authorizer *o_authorizer;
36 void *o_authorizer_buf, *o_authorizer_reply_buf;
37 size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
38 unsigned long lru_ttl;
39 int o_marked_for_keepalive;
40 struct list_head o_keepalive_item;
41};
42
43/* an in-flight request */
44struct ceph_osd_request {
45 u64 r_tid; /* unique for this client */
46 struct rb_node r_node;
47 struct list_head r_req_lru_item;
48 struct list_head r_osd_item;
49 struct ceph_osd *r_osd;
50 struct ceph_pg r_pgid;
51
52 struct ceph_connection *r_con_filling_msg;
53
54 struct ceph_msg *r_request, *r_reply;
55 int r_result;
56 int r_flags; /* any additional flags for the osd */
57 u32 r_sent; /* >0 if r_request is sending/sent */
58 int r_got_reply;
59
60 struct ceph_osd_client *r_osdc;
61 struct kref r_kref;
62 bool r_mempool;
63 struct completion r_completion, r_safe_completion;
64 ceph_osdc_callback_t r_callback, r_safe_callback;
65 struct ceph_eversion r_reassert_version;
66 struct list_head r_unsafe_item;
67
68 struct inode *r_inode; /* for use by callbacks */
69 struct writeback_control *r_wbc; /* ditto */
70
71 char r_oid[40]; /* object name */
72 int r_oid_len;
73 unsigned long r_stamp; /* send OR check time */
74 bool r_resend; /* msg send failed, needs retry */
75
76 struct ceph_file_layout r_file_layout;
77 struct ceph_snap_context *r_snapc; /* snap context for writes */
78 unsigned r_num_pages; /* size of page array (follows) */
79 struct page **r_pages; /* pages for data payload */
80 int r_pages_from_pool;
81 int r_own_pages; /* if true, i own page list */
82};
83
84struct ceph_osd_client {
85 struct ceph_client *client;
86
87 struct ceph_osdmap *osdmap; /* current map */
88 struct rw_semaphore map_sem;
89 struct completion map_waiters;
90 u64 last_requested_map;
91
92 struct mutex request_mutex;
93 struct rb_root osds; /* osds */
94 struct list_head osd_lru; /* idle osds */
95 u64 timeout_tid; /* tid of timeout triggering rq */
96 u64 last_tid; /* tid of last request */
97 struct rb_root requests; /* pending requests */
98 struct list_head req_lru; /* pending requests lru */
99 int num_requests;
100 struct delayed_work timeout_work;
101 struct delayed_work osds_timeout_work;
102#ifdef CONFIG_DEBUG_FS
103 struct dentry *debugfs_file;
104#endif
105
106 mempool_t *req_mempool;
107
108 struct ceph_msgpool msgpool_op;
109 struct ceph_msgpool msgpool_op_reply;
110};
111
112extern int ceph_osdc_init(struct ceph_osd_client *osdc,
113 struct ceph_client *client);
114extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
115
116extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
117 struct ceph_msg *msg);
118extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
119 struct ceph_msg *msg);
120
121extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
122 struct ceph_file_layout *layout,
123 struct ceph_vino vino,
124 u64 offset, u64 *len, int op, int flags,
125 struct ceph_snap_context *snapc,
126 int do_sync, u32 truncate_seq,
127 u64 truncate_size,
128 struct timespec *mtime,
129 bool use_mempool, int num_reply);
130
131static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
132{
133 kref_get(&req->r_kref);
134}
135extern void ceph_osdc_release_request(struct kref *kref);
136static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
137{
138 kref_put(&req->r_kref, ceph_osdc_release_request);
139}
140
141extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
142 struct ceph_osd_request *req,
143 bool nofail);
144extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
145 struct ceph_osd_request *req);
146extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
147
148extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
149 struct ceph_vino vino,
150 struct ceph_file_layout *layout,
151 u64 off, u64 *plen,
152 u32 truncate_seq, u64 truncate_size,
153 struct page **pages, int nr_pages);
154
155extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
156 struct ceph_vino vino,
157 struct ceph_file_layout *layout,
158 struct ceph_snap_context *sc,
159 u64 off, u64 len,
160 u32 truncate_seq, u64 truncate_size,
161 struct timespec *mtime,
162 struct page **pages, int nr_pages,
163 int flags, int do_sync, bool nofail);
164
165#endif
166
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
new file mode 100644
index 000000000000..2e2c15eed82a
--- /dev/null
+++ b/fs/ceph/osdmap.c
@@ -0,0 +1,1062 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5#include <asm/div64.h>
6
7#include "super.h"
8#include "osdmap.h"
9#include "crush/hash.h"
10#include "crush/mapper.h"
11#include "decode.h"
12
13char *ceph_osdmap_state_str(char *str, int len, int state)
14{
15 int flag = 0;
16
17 if (!len)
18 goto done;
19
20 *str = '\0';
21 if (state) {
22 if (state & CEPH_OSD_EXISTS) {
23 snprintf(str, len, "exists");
24 flag = 1;
25 }
26 if (state & CEPH_OSD_UP) {
27 snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
28 "up");
29 flag = 1;
30 }
31 } else {
32 snprintf(str, len, "doesn't exist");
33 }
34done:
35 return str;
36}
37
38/* maps */
39
/*
 * Return the number of bits needed to represent @t, i.e. the position of
 * its highest set bit counted from 1 (0 when @t is 0).
 */
static int calc_bits_of(unsigned t)
{
	int bits;

	for (bits = 0; t != 0; t >>= 1)
		bits++;
	return bits;
}
49
50/*
51 * the foo_mask is the smallest value 2^n-1 that is >= foo.
52 */
53static void calc_pg_masks(struct ceph_pg_pool_info *pi)
54{
55 pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
56 pi->pgp_num_mask =
57 (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
58 pi->lpg_num_mask =
59 (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
60 pi->lpgp_num_mask =
61 (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
62}
63
64/*
65 * decode crush map
66 */
67static int crush_decode_uniform_bucket(void **p, void *end,
68 struct crush_bucket_uniform *b)
69{
70 dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
71 ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
72 b->item_weight = ceph_decode_32(p);
73 return 0;
74bad:
75 return -EINVAL;
76}
77
78static int crush_decode_list_bucket(void **p, void *end,
79 struct crush_bucket_list *b)
80{
81 int j;
82 dout("crush_decode_list_bucket %p to %p\n", *p, end);
83 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
84 if (b->item_weights == NULL)
85 return -ENOMEM;
86 b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
87 if (b->sum_weights == NULL)
88 return -ENOMEM;
89 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
90 for (j = 0; j < b->h.size; j++) {
91 b->item_weights[j] = ceph_decode_32(p);
92 b->sum_weights[j] = ceph_decode_32(p);
93 }
94 return 0;
95bad:
96 return -EINVAL;
97}
98
99static int crush_decode_tree_bucket(void **p, void *end,
100 struct crush_bucket_tree *b)
101{
102 int j;
103 dout("crush_decode_tree_bucket %p to %p\n", *p, end);
104 ceph_decode_32_safe(p, end, b->num_nodes, bad);
105 b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
106 if (b->node_weights == NULL)
107 return -ENOMEM;
108 ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
109 for (j = 0; j < b->num_nodes; j++)
110 b->node_weights[j] = ceph_decode_32(p);
111 return 0;
112bad:
113 return -EINVAL;
114}
115
116static int crush_decode_straw_bucket(void **p, void *end,
117 struct crush_bucket_straw *b)
118{
119 int j;
120 dout("crush_decode_straw_bucket %p to %p\n", *p, end);
121 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
122 if (b->item_weights == NULL)
123 return -ENOMEM;
124 b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
125 if (b->straws == NULL)
126 return -ENOMEM;
127 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
128 for (j = 0; j < b->h.size; j++) {
129 b->item_weights[j] = ceph_decode_32(p);
130 b->straws[j] = ceph_decode_32(p);
131 }
132 return 0;
133bad:
134 return -EINVAL;
135}
136
137static struct crush_map *crush_decode(void *pbyval, void *end)
138{
139 struct crush_map *c;
140 int err = -EINVAL;
141 int i, j;
142 void **p = &pbyval;
143 void *start = pbyval;
144 u32 magic;
145
146 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
147
148 c = kzalloc(sizeof(*c), GFP_NOFS);
149 if (c == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 ceph_decode_need(p, end, 4*sizeof(u32), bad);
153 magic = ceph_decode_32(p);
154 if (magic != CRUSH_MAGIC) {
155 pr_err("crush_decode magic %x != current %x\n",
156 (unsigned)magic, (unsigned)CRUSH_MAGIC);
157 goto bad;
158 }
159 c->max_buckets = ceph_decode_32(p);
160 c->max_rules = ceph_decode_32(p);
161 c->max_devices = ceph_decode_32(p);
162
163 c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
164 if (c->device_parents == NULL)
165 goto badmem;
166 c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
167 if (c->bucket_parents == NULL)
168 goto badmem;
169
170 c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
171 if (c->buckets == NULL)
172 goto badmem;
173 c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
174 if (c->rules == NULL)
175 goto badmem;
176
177 /* buckets */
178 for (i = 0; i < c->max_buckets; i++) {
179 int size = 0;
180 u32 alg;
181 struct crush_bucket *b;
182
183 ceph_decode_32_safe(p, end, alg, bad);
184 if (alg == 0) {
185 c->buckets[i] = NULL;
186 continue;
187 }
188 dout("crush_decode bucket %d off %x %p to %p\n",
189 i, (int)(*p-start), *p, end);
190
191 switch (alg) {
192 case CRUSH_BUCKET_UNIFORM:
193 size = sizeof(struct crush_bucket_uniform);
194 break;
195 case CRUSH_BUCKET_LIST:
196 size = sizeof(struct crush_bucket_list);
197 break;
198 case CRUSH_BUCKET_TREE:
199 size = sizeof(struct crush_bucket_tree);
200 break;
201 case CRUSH_BUCKET_STRAW:
202 size = sizeof(struct crush_bucket_straw);
203 break;
204 default:
205 err = -EINVAL;
206 goto bad;
207 }
208 BUG_ON(size == 0);
209 b = c->buckets[i] = kzalloc(size, GFP_NOFS);
210 if (b == NULL)
211 goto badmem;
212
213 ceph_decode_need(p, end, 4*sizeof(u32), bad);
214 b->id = ceph_decode_32(p);
215 b->type = ceph_decode_16(p);
216 b->alg = ceph_decode_8(p);
217 b->hash = ceph_decode_8(p);
218 b->weight = ceph_decode_32(p);
219 b->size = ceph_decode_32(p);
220
221 dout("crush_decode bucket size %d off %x %p to %p\n",
222 b->size, (int)(*p-start), *p, end);
223
224 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
225 if (b->items == NULL)
226 goto badmem;
227 b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
228 if (b->perm == NULL)
229 goto badmem;
230 b->perm_n = 0;
231
232 ceph_decode_need(p, end, b->size*sizeof(u32), bad);
233 for (j = 0; j < b->size; j++)
234 b->items[j] = ceph_decode_32(p);
235
236 switch (b->alg) {
237 case CRUSH_BUCKET_UNIFORM:
238 err = crush_decode_uniform_bucket(p, end,
239 (struct crush_bucket_uniform *)b);
240 if (err < 0)
241 goto bad;
242 break;
243 case CRUSH_BUCKET_LIST:
244 err = crush_decode_list_bucket(p, end,
245 (struct crush_bucket_list *)b);
246 if (err < 0)
247 goto bad;
248 break;
249 case CRUSH_BUCKET_TREE:
250 err = crush_decode_tree_bucket(p, end,
251 (struct crush_bucket_tree *)b);
252 if (err < 0)
253 goto bad;
254 break;
255 case CRUSH_BUCKET_STRAW:
256 err = crush_decode_straw_bucket(p, end,
257 (struct crush_bucket_straw *)b);
258 if (err < 0)
259 goto bad;
260 break;
261 }
262 }
263
264 /* rules */
265 dout("rule vec is %p\n", c->rules);
266 for (i = 0; i < c->max_rules; i++) {
267 u32 yes;
268 struct crush_rule *r;
269
270 ceph_decode_32_safe(p, end, yes, bad);
271 if (!yes) {
272 dout("crush_decode NO rule %d off %x %p to %p\n",
273 i, (int)(*p-start), *p, end);
274 c->rules[i] = NULL;
275 continue;
276 }
277
278 dout("crush_decode rule %d off %x %p to %p\n",
279 i, (int)(*p-start), *p, end);
280
281 /* len */
282 ceph_decode_32_safe(p, end, yes, bad);
283#if BITS_PER_LONG == 32
284 err = -EINVAL;
285 if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
286 goto bad;
287#endif
288 r = c->rules[i] = kmalloc(sizeof(*r) +
289 yes*sizeof(struct crush_rule_step),
290 GFP_NOFS);
291 if (r == NULL)
292 goto badmem;
293 dout(" rule %d is at %p\n", i, r);
294 r->len = yes;
295 ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
296 ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
297 for (j = 0; j < r->len; j++) {
298 r->steps[j].op = ceph_decode_32(p);
299 r->steps[j].arg1 = ceph_decode_32(p);
300 r->steps[j].arg2 = ceph_decode_32(p);
301 }
302 }
303
304 /* ignore trailing name maps. */
305
306 dout("crush_decode success\n");
307 return c;
308
309badmem:
310 err = -ENOMEM;
311bad:
312 dout("crush_decode fail %d\n", err);
313 crush_destroy(c);
314 return ERR_PTR(err);
315}
316
317/*
318 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
319 * to a set of osds)
320 */
321static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
322{
323 u64 a = *(u64 *)&l;
324 u64 b = *(u64 *)&r;
325
326 if (a < b)
327 return -1;
328 if (a > b)
329 return 1;
330 return 0;
331}
332
333static int __insert_pg_mapping(struct ceph_pg_mapping *new,
334 struct rb_root *root)
335{
336 struct rb_node **p = &root->rb_node;
337 struct rb_node *parent = NULL;
338 struct ceph_pg_mapping *pg = NULL;
339 int c;
340
341 while (*p) {
342 parent = *p;
343 pg = rb_entry(parent, struct ceph_pg_mapping, node);
344 c = pgid_cmp(new->pgid, pg->pgid);
345 if (c < 0)
346 p = &(*p)->rb_left;
347 else if (c > 0)
348 p = &(*p)->rb_right;
349 else
350 return -EEXIST;
351 }
352
353 rb_link_node(&new->node, parent, p);
354 rb_insert_color(&new->node, root);
355 return 0;
356}
357
358static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
359 struct ceph_pg pgid)
360{
361 struct rb_node *n = root->rb_node;
362 struct ceph_pg_mapping *pg;
363 int c;
364
365 while (n) {
366 pg = rb_entry(n, struct ceph_pg_mapping, node);
367 c = pgid_cmp(pgid, pg->pgid);
368 if (c < 0)
369 n = n->rb_left;
370 else if (c > 0)
371 n = n->rb_right;
372 else
373 return pg;
374 }
375 return NULL;
376}
377
378/*
379 * rbtree of pg pool info
380 */
381static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
382{
383 struct rb_node **p = &root->rb_node;
384 struct rb_node *parent = NULL;
385 struct ceph_pg_pool_info *pi = NULL;
386
387 while (*p) {
388 parent = *p;
389 pi = rb_entry(parent, struct ceph_pg_pool_info, node);
390 if (new->id < pi->id)
391 p = &(*p)->rb_left;
392 else if (new->id > pi->id)
393 p = &(*p)->rb_right;
394 else
395 return -EEXIST;
396 }
397
398 rb_link_node(&new->node, parent, p);
399 rb_insert_color(&new->node, root);
400 return 0;
401}
402
403static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
404{
405 struct ceph_pg_pool_info *pi;
406 struct rb_node *n = root->rb_node;
407
408 while (n) {
409 pi = rb_entry(n, struct ceph_pg_pool_info, node);
410 if (id < pi->id)
411 n = n->rb_left;
412 else if (id > pi->id)
413 n = n->rb_right;
414 else
415 return pi;
416 }
417 return NULL;
418}
419
420static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
421{
422 rb_erase(&pi->node, root);
423 kfree(pi->name);
424 kfree(pi);
425}
426
427void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
428{
429 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
430 calc_pg_masks(pi);
431 *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
432 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
433}
434
435static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
436{
437 struct ceph_pg_pool_info *pi;
438 u32 num, len, pool;
439
440 ceph_decode_32_safe(p, end, num, bad);
441 dout(" %d pool names\n", num);
442 while (num--) {
443 ceph_decode_32_safe(p, end, pool, bad);
444 ceph_decode_32_safe(p, end, len, bad);
445 dout(" pool %d len %d\n", pool, len);
446 pi = __lookup_pg_pool(&map->pg_pools, pool);
447 if (pi) {
448 kfree(pi->name);
449 pi->name = kmalloc(len + 1, GFP_NOFS);
450 if (pi->name) {
451 memcpy(pi->name, *p, len);
452 pi->name[len] = '\0';
453 dout(" name is %s\n", pi->name);
454 }
455 }
456 *p += len;
457 }
458 return 0;
459
460bad:
461 return -EINVAL;
462}
463
464/*
465 * osd map
466 */
467void ceph_osdmap_destroy(struct ceph_osdmap *map)
468{
469 dout("osdmap_destroy %p\n", map);
470 if (map->crush)
471 crush_destroy(map->crush);
472 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
473 struct ceph_pg_mapping *pg =
474 rb_entry(rb_first(&map->pg_temp),
475 struct ceph_pg_mapping, node);
476 rb_erase(&pg->node, &map->pg_temp);
477 kfree(pg);
478 }
479 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
480 struct ceph_pg_pool_info *pi =
481 rb_entry(rb_first(&map->pg_pools),
482 struct ceph_pg_pool_info, node);
483 __remove_pg_pool(&map->pg_pools, pi);
484 }
485 kfree(map->osd_state);
486 kfree(map->osd_weight);
487 kfree(map->osd_addr);
488 kfree(map);
489}
490
491/*
492 * adjust max osd value. reallocate arrays.
493 */
494static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
495{
496 u8 *state;
497 struct ceph_entity_addr *addr;
498 u32 *weight;
499
500 state = kcalloc(max, sizeof(*state), GFP_NOFS);
501 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
502 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
503 if (state == NULL || addr == NULL || weight == NULL) {
504 kfree(state);
505 kfree(addr);
506 kfree(weight);
507 return -ENOMEM;
508 }
509
510 /* copy old? */
511 if (map->osd_state) {
512 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
513 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
514 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
515 kfree(map->osd_state);
516 kfree(map->osd_addr);
517 kfree(map->osd_weight);
518 }
519
520 map->osd_state = state;
521 map->osd_weight = weight;
522 map->osd_addr = addr;
523 map->max_osd = max;
524 return 0;
525}
526
527/*
528 * decode a full map.
529 */
530struct ceph_osdmap *osdmap_decode(void **p, void *end)
531{
532 struct ceph_osdmap *map;
533 u16 version;
534 u32 len, max, i;
535 u8 ev;
536 int err = -EINVAL;
537 void *start = *p;
538 struct ceph_pg_pool_info *pi;
539
540 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
541
542 map = kzalloc(sizeof(*map), GFP_NOFS);
543 if (map == NULL)
544 return ERR_PTR(-ENOMEM);
545 map->pg_temp = RB_ROOT;
546
547 ceph_decode_16_safe(p, end, version, bad);
548 if (version > CEPH_OSDMAP_VERSION) {
549 pr_warning("got unknown v %d > %d of osdmap\n", version,
550 CEPH_OSDMAP_VERSION);
551 goto bad;
552 }
553
554 ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
555 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
556 map->epoch = ceph_decode_32(p);
557 ceph_decode_copy(p, &map->created, sizeof(map->created));
558 ceph_decode_copy(p, &map->modified, sizeof(map->modified));
559
560 ceph_decode_32_safe(p, end, max, bad);
561 while (max--) {
562 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
563 pi = kzalloc(sizeof(*pi), GFP_NOFS);
564 if (!pi)
565 goto bad;
566 pi->id = ceph_decode_32(p);
567 ev = ceph_decode_8(p); /* encoding version */
568 if (ev > CEPH_PG_POOL_VERSION) {
569 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
570 ev, CEPH_PG_POOL_VERSION);
571 goto bad;
572 }
573 __decode_pool(p, pi);
574 __insert_pg_pool(&map->pg_pools, pi);
575 }
576
577 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
578 goto bad;
579
580 ceph_decode_32_safe(p, end, map->pool_max, bad);
581
582 ceph_decode_32_safe(p, end, map->flags, bad);
583
584 max = ceph_decode_32(p);
585
586 /* (re)alloc osd arrays */
587 err = osdmap_set_max_osd(map, max);
588 if (err < 0)
589 goto bad;
590 dout("osdmap_decode max_osd = %d\n", map->max_osd);
591
592 /* osds */
593 err = -EINVAL;
594 ceph_decode_need(p, end, 3*sizeof(u32) +
595 map->max_osd*(1 + sizeof(*map->osd_weight) +
596 sizeof(*map->osd_addr)), bad);
597 *p += 4; /* skip length field (should match max) */
598 ceph_decode_copy(p, map->osd_state, map->max_osd);
599
600 *p += 4; /* skip length field (should match max) */
601 for (i = 0; i < map->max_osd; i++)
602 map->osd_weight[i] = ceph_decode_32(p);
603
604 *p += 4; /* skip length field (should match max) */
605 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
606 for (i = 0; i < map->max_osd; i++)
607 ceph_decode_addr(&map->osd_addr[i]);
608
609 /* pg_temp */
610 ceph_decode_32_safe(p, end, len, bad);
611 for (i = 0; i < len; i++) {
612 int n, j;
613 struct ceph_pg pgid;
614 struct ceph_pg_mapping *pg;
615
616 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
617 ceph_decode_copy(p, &pgid, sizeof(pgid));
618 n = ceph_decode_32(p);
619 ceph_decode_need(p, end, n * sizeof(u32), bad);
620 err = -ENOMEM;
621 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
622 if (!pg)
623 goto bad;
624 pg->pgid = pgid;
625 pg->len = n;
626 for (j = 0; j < n; j++)
627 pg->osds[j] = ceph_decode_32(p);
628
629 err = __insert_pg_mapping(pg, &map->pg_temp);
630 if (err)
631 goto bad;
632 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
633 }
634
635 /* crush */
636 ceph_decode_32_safe(p, end, len, bad);
637 dout("osdmap_decode crush len %d from off 0x%x\n", len,
638 (int)(*p - start));
639 ceph_decode_need(p, end, len, bad);
640 map->crush = crush_decode(*p, end);
641 *p += len;
642 if (IS_ERR(map->crush)) {
643 err = PTR_ERR(map->crush);
644 map->crush = NULL;
645 goto bad;
646 }
647
648 /* ignore the rest of the map */
649 *p = end;
650
651 dout("osdmap_decode done %p %p\n", *p, end);
652 return map;
653
654bad:
655 dout("osdmap_decode fail\n");
656 ceph_osdmap_destroy(map);
657 return ERR_PTR(err);
658}
659
660/*
661 * decode and apply an incremental map update.
662 */
663struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
664 struct ceph_osdmap *map,
665 struct ceph_messenger *msgr)
666{
667 struct crush_map *newcrush = NULL;
668 struct ceph_fsid fsid;
669 u32 epoch = 0;
670 struct ceph_timespec modified;
671 u32 len, pool;
672 __s32 new_pool_max, new_flags, max;
673 void *start = *p;
674 int err = -EINVAL;
675 u16 version;
676 struct rb_node *rbp;
677
678 ceph_decode_16_safe(p, end, version, bad);
679 if (version > CEPH_OSDMAP_INC_VERSION) {
680 pr_warning("got unknown v %d > %d of inc osdmap\n", version,
681 CEPH_OSDMAP_INC_VERSION);
682 goto bad;
683 }
684
685 ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
686 bad);
687 ceph_decode_copy(p, &fsid, sizeof(fsid));
688 epoch = ceph_decode_32(p);
689 BUG_ON(epoch != map->epoch+1);
690 ceph_decode_copy(p, &modified, sizeof(modified));
691 new_pool_max = ceph_decode_32(p);
692 new_flags = ceph_decode_32(p);
693
694 /* full map? */
695 ceph_decode_32_safe(p, end, len, bad);
696 if (len > 0) {
697 dout("apply_incremental full map len %d, %p to %p\n",
698 len, *p, end);
699 return osdmap_decode(p, min(*p+len, end));
700 }
701
702 /* new crush? */
703 ceph_decode_32_safe(p, end, len, bad);
704 if (len > 0) {
705 dout("apply_incremental new crush map len %d, %p to %p\n",
706 len, *p, end);
707 newcrush = crush_decode(*p, min(*p+len, end));
708 if (IS_ERR(newcrush))
709 return ERR_PTR(PTR_ERR(newcrush));
710 }
711
712 /* new flags? */
713 if (new_flags >= 0)
714 map->flags = new_flags;
715 if (new_pool_max >= 0)
716 map->pool_max = new_pool_max;
717
718 ceph_decode_need(p, end, 5*sizeof(u32), bad);
719
720 /* new max? */
721 max = ceph_decode_32(p);
722 if (max >= 0) {
723 err = osdmap_set_max_osd(map, max);
724 if (err < 0)
725 goto bad;
726 }
727
728 map->epoch++;
729 map->modified = map->modified;
730 if (newcrush) {
731 if (map->crush)
732 crush_destroy(map->crush);
733 map->crush = newcrush;
734 newcrush = NULL;
735 }
736
737 /* new_pool */
738 ceph_decode_32_safe(p, end, len, bad);
739 while (len--) {
740 __u8 ev;
741 struct ceph_pg_pool_info *pi;
742
743 ceph_decode_32_safe(p, end, pool, bad);
744 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
745 ev = ceph_decode_8(p); /* encoding version */
746 if (ev > CEPH_PG_POOL_VERSION) {
747 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
748 ev, CEPH_PG_POOL_VERSION);
749 goto bad;
750 }
751 pi = __lookup_pg_pool(&map->pg_pools, pool);
752 if (!pi) {
753 pi = kzalloc(sizeof(*pi), GFP_NOFS);
754 if (!pi) {
755 err = -ENOMEM;
756 goto bad;
757 }
758 pi->id = pool;
759 __insert_pg_pool(&map->pg_pools, pi);
760 }
761 __decode_pool(p, pi);
762 }
763 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
764 goto bad;
765
766 /* old_pool */
767 ceph_decode_32_safe(p, end, len, bad);
768 while (len--) {
769 struct ceph_pg_pool_info *pi;
770
771 ceph_decode_32_safe(p, end, pool, bad);
772 pi = __lookup_pg_pool(&map->pg_pools, pool);
773 if (pi)
774 __remove_pg_pool(&map->pg_pools, pi);
775 }
776
777 /* new_up */
778 err = -EINVAL;
779 ceph_decode_32_safe(p, end, len, bad);
780 while (len--) {
781 u32 osd;
782 struct ceph_entity_addr addr;
783 ceph_decode_32_safe(p, end, osd, bad);
784 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
785 ceph_decode_addr(&addr);
786 pr_info("osd%d up\n", osd);
787 BUG_ON(osd >= map->max_osd);
788 map->osd_state[osd] |= CEPH_OSD_UP;
789 map->osd_addr[osd] = addr;
790 }
791
792 /* new_down */
793 ceph_decode_32_safe(p, end, len, bad);
794 while (len--) {
795 u32 osd;
796 ceph_decode_32_safe(p, end, osd, bad);
797 (*p)++; /* clean flag */
798 pr_info("osd%d down\n", osd);
799 if (osd < map->max_osd)
800 map->osd_state[osd] &= ~CEPH_OSD_UP;
801 }
802
803 /* new_weight */
804 ceph_decode_32_safe(p, end, len, bad);
805 while (len--) {
806 u32 osd, off;
807 ceph_decode_need(p, end, sizeof(u32)*2, bad);
808 osd = ceph_decode_32(p);
809 off = ceph_decode_32(p);
810 pr_info("osd%d weight 0x%x %s\n", osd, off,
811 off == CEPH_OSD_IN ? "(in)" :
812 (off == CEPH_OSD_OUT ? "(out)" : ""));
813 if (osd < map->max_osd)
814 map->osd_weight[osd] = off;
815 }
816
817 /* new_pg_temp */
818 rbp = rb_first(&map->pg_temp);
819 ceph_decode_32_safe(p, end, len, bad);
820 while (len--) {
821 struct ceph_pg_mapping *pg;
822 int j;
823 struct ceph_pg pgid;
824 u32 pglen;
825 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
826 ceph_decode_copy(p, &pgid, sizeof(pgid));
827 pglen = ceph_decode_32(p);
828
829 /* remove any? */
830 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
831 node)->pgid, pgid) <= 0) {
832 struct rb_node *cur = rbp;
833 rbp = rb_next(rbp);
834 dout(" removed pg_temp %llx\n",
835 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
836 node)->pgid);
837 rb_erase(cur, &map->pg_temp);
838 }
839
840 if (pglen) {
841 /* insert */
842 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
843 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
844 if (!pg) {
845 err = -ENOMEM;
846 goto bad;
847 }
848 pg->pgid = pgid;
849 pg->len = pglen;
850 for (j = 0; j < pglen; j++)
851 pg->osds[j] = ceph_decode_32(p);
852 err = __insert_pg_mapping(pg, &map->pg_temp);
853 if (err)
854 goto bad;
855 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
856 pglen);
857 }
858 }
859 while (rbp) {
860 struct rb_node *cur = rbp;
861 rbp = rb_next(rbp);
862 dout(" removed pg_temp %llx\n",
863 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
864 node)->pgid);
865 rb_erase(cur, &map->pg_temp);
866 }
867
868 /* ignore the rest */
869 *p = end;
870 return map;
871
872bad:
873 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
874 epoch, (int)(*p - start), *p, start, end);
875 print_hex_dump(KERN_DEBUG, "osdmap: ",
876 DUMP_PREFIX_OFFSET, 16, 1,
877 start, end - start, true);
878 if (newcrush)
879 crush_destroy(newcrush);
880 return ERR_PTR(err);
881}
882
883
884
885
886/*
887 * calculate file layout from given offset, length.
888 * fill in correct oid, logical length, and object extent
889 * offset, length.
890 *
891 * for now, we write only a single su, until we can
892 * pass a stride back to the caller.
893 */
894void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
895 u64 off, u64 *plen,
896 u64 *ono,
897 u64 *oxoff, u64 *oxlen)
898{
899 u32 osize = le32_to_cpu(layout->fl_object_size);
900 u32 su = le32_to_cpu(layout->fl_stripe_unit);
901 u32 sc = le32_to_cpu(layout->fl_stripe_count);
902 u32 bl, stripeno, stripepos, objsetno;
903 u32 su_per_object;
904 u64 t, su_offset;
905
906 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
907 osize, su);
908 su_per_object = osize / su;
909 dout("osize %u / su %u = su_per_object %u\n", osize, su,
910 su_per_object);
911
912 BUG_ON((su & ~PAGE_MASK) != 0);
913 /* bl = *off / su; */
914 t = off;
915 do_div(t, su);
916 bl = t;
917 dout("off %llu / su %u = bl %u\n", off, su, bl);
918
919 stripeno = bl / sc;
920 stripepos = bl % sc;
921 objsetno = stripeno / su_per_object;
922
923 *ono = objsetno * sc + stripepos;
924 dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
925
926 /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
927 t = off;
928 su_offset = do_div(t, su);
929 *oxoff = su_offset + (stripeno % su_per_object) * su;
930
931 /*
932 * Calculate the length of the extent being written to the selected
933 * object. This is the minimum of the full length requested (plen) or
934 * the remainder of the current stripe being written to.
935 */
936 *oxlen = min_t(u64, *plen, su - su_offset);
937 *plen = *oxlen;
938
939 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
940}
941
942/*
943 * calculate an object layout (i.e. pgid) from an oid,
944 * file_layout, and osdmap
945 */
946int ceph_calc_object_layout(struct ceph_object_layout *ol,
947 const char *oid,
948 struct ceph_file_layout *fl,
949 struct ceph_osdmap *osdmap)
950{
951 unsigned num, num_mask;
952 struct ceph_pg pgid;
953 s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
954 int poolid = le32_to_cpu(fl->fl_pg_pool);
955 struct ceph_pg_pool_info *pool;
956 unsigned ps;
957
958 BUG_ON(!osdmap);
959
960 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
961 if (!pool)
962 return -EIO;
963 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
964 if (preferred >= 0) {
965 ps += preferred;
966 num = le32_to_cpu(pool->v.lpg_num);
967 num_mask = pool->lpg_num_mask;
968 } else {
969 num = le32_to_cpu(pool->v.pg_num);
970 num_mask = pool->pg_num_mask;
971 }
972
973 pgid.ps = cpu_to_le16(ps);
974 pgid.preferred = cpu_to_le16(preferred);
975 pgid.pool = fl->fl_pg_pool;
976 if (preferred >= 0)
977 dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
978 (int)preferred);
979 else
980 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
981
982 ol->ol_pgid = pgid;
983 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
984 return 0;
985}
986
987/*
988 * Calculate raw osd vector for the given pgid. Return pointer to osd
989 * array, or NULL on failure.
990 */
991static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
992 int *osds, int *num)
993{
994 struct ceph_pg_mapping *pg;
995 struct ceph_pg_pool_info *pool;
996 int ruleno;
997 unsigned poolid, ps, pps;
998 int preferred;
999
1000 /* pg_temp? */
1001 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
1002 if (pg) {
1003 *num = pg->len;
1004 return pg->osds;
1005 }
1006
1007 /* crush */
1008 poolid = le32_to_cpu(pgid.pool);
1009 ps = le16_to_cpu(pgid.ps);
1010 preferred = (s16)le16_to_cpu(pgid.preferred);
1011
1012 /* don't forcefeed bad device ids to crush */
1013 if (preferred >= osdmap->max_osd ||
1014 preferred >= osdmap->crush->max_devices)
1015 preferred = -1;
1016
1017 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
1018 if (!pool)
1019 return NULL;
1020 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
1021 pool->v.type, pool->v.size);
1022 if (ruleno < 0) {
1023 pr_err("no crush rule pool %d type %d size %d\n",
1024 poolid, pool->v.type, pool->v.size);
1025 return NULL;
1026 }
1027
1028 if (preferred >= 0)
1029 pps = ceph_stable_mod(ps,
1030 le32_to_cpu(pool->v.lpgp_num),
1031 pool->lpgp_num_mask);
1032 else
1033 pps = ceph_stable_mod(ps,
1034 le32_to_cpu(pool->v.pgp_num),
1035 pool->pgp_num_mask);
1036 pps += poolid;
1037 *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
1038 min_t(int, pool->v.size, *num),
1039 preferred, osdmap->osd_weight);
1040 return osds;
1041}
1042
1043/*
1044 * Return primary osd for given pgid, or -1 if none.
1045 */
1046int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1047{
1048 int rawosds[10], *osds;
1049 int i, num = ARRAY_SIZE(rawosds);
1050
1051 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1052 if (!osds)
1053 return -1;
1054
1055 /* primary is first up osd */
1056 for (i = 0; i < num; i++)
1057 if (ceph_osd_is_up(osdmap, osds[i])) {
1058 return osds[i];
1059 break;
1060 }
1061 return -1;
1062}
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
new file mode 100644
index 000000000000..8bc9f1e4f562
--- /dev/null
+++ b/fs/ceph/osdmap.h
@@ -0,0 +1,126 @@
1#ifndef _FS_CEPH_OSDMAP_H
2#define _FS_CEPH_OSDMAP_H
3
4#include <linux/rbtree.h>
5#include "types.h"
6#include "ceph_fs.h"
7#include "crush/crush.h"
8
9/*
10 * The osd map describes the current membership of the osd cluster and
11 * specifies the mapping of objects to placement groups and placement
12 * groups to (sets of) osds. That is, it completely specifies the
13 * (desired) distribution of all data objects in the system at some
14 * point in time.
15 *
16 * Each map version is identified by an epoch, which increases monotonically.
17 *
18 * The map can be updated either via an incremental map (diff) describing
19 * the change between two successive epochs, or as a fully encoded map.
20 */
21struct ceph_pg_pool_info {
22 struct rb_node node;
23 int id;
24 struct ceph_pg_pool v;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
26 char *name;
27};
28
29struct ceph_pg_mapping {
30 struct rb_node node;
31 struct ceph_pg pgid;
32 int len;
33 int osds[];
34};
35
36struct ceph_osdmap {
37 struct ceph_fsid fsid;
38 u32 epoch;
39 u32 mkfs_epoch;
40 struct ceph_timespec created, modified;
41
42 u32 flags; /* CEPH_OSDMAP_* */
43
44 u32 max_osd; /* size of osd_state, _offload, _addr arrays */
45 u8 *osd_state; /* CEPH_OSD_* */
46 u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
47 struct ceph_entity_addr *osd_addr;
48
49 struct rb_root pg_temp;
50 struct rb_root pg_pools;
51 u32 pool_max;
52
53 /* the CRUSH map specifies the mapping of placement groups to
54 * the list of osds that store+replicate them. */
55 struct crush_map *crush;
56};
57
/*
 * file layout helpers
 *
 * Each accessor takes a struct ceph_file_layout lvalue (not a pointer)
 * and yields one of its little-endian 32-bit fields converted to host
 * order and cast to __s32.
 */
#define ceph_file_layout_su(l) \
	((__s32)le32_to_cpu((l).fl_stripe_unit))
#define ceph_file_layout_stripe_count(l) \
	((__s32)le32_to_cpu((l).fl_stripe_count))
#define ceph_file_layout_object_size(l) \
	((__s32)le32_to_cpu((l).fl_object_size))
#define ceph_file_layout_cas_hash(l) \
	((__s32)le32_to_cpu((l).fl_cas_hash))
#define ceph_file_layout_object_su(l) \
	((__s32)le32_to_cpu((l).fl_object_stripe_unit))
#define ceph_file_layout_pg_preferred(l) \
	((__s32)le32_to_cpu((l).fl_pg_preferred))
#define ceph_file_layout_pg_pool(l) \
	((__s32)le32_to_cpu((l).fl_pg_pool))
72
73static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
74{
75 return le32_to_cpu(l->fl_stripe_unit) *
76 le32_to_cpu(l->fl_stripe_count);
77}
78
79/* "period" == bytes before i start on a new set of objects */
80static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
81{
82 return le32_to_cpu(l->fl_object_size) *
83 le32_to_cpu(l->fl_stripe_count);
84}
85
86
87static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
88{
89 return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
90}
91
92static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
93{
94 return map && (map->flags & flag);
95}
96
97extern char *ceph_osdmap_state_str(char *str, int len, int state);
98
99static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
100 int osd)
101{
102 if (osd >= map->max_osd)
103 return NULL;
104 return &map->osd_addr[osd];
105}
106
107extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
108extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
109 struct ceph_osdmap *map,
110 struct ceph_messenger *msgr);
111extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
112
113/* calculate mapping of a file extent to an object */
114extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
115 u64 off, u64 *plen,
116 u64 *bno, u64 *oxoff, u64 *oxlen);
117
118/* calculate mapping of object to a placement group */
119extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
120 const char *oid,
121 struct ceph_file_layout *fl,
122 struct ceph_osdmap *osdmap);
123extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
124 struct ceph_pg pgid);
125
126#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
new file mode 100644
index 000000000000..5f8dbf7c745a
--- /dev/null
+++ b/fs/ceph/pagelist.c
@@ -0,0 +1,55 @@
1
2#include <linux/gfp.h>
3#include <linux/pagemap.h>
4#include <linux/highmem.h>
5
6#include "pagelist.h"
7
8int ceph_pagelist_release(struct ceph_pagelist *pl)
9{
10 if (pl->mapped_tail)
11 kunmap(pl->mapped_tail);
12 while (!list_empty(&pl->head)) {
13 struct page *page = list_first_entry(&pl->head, struct page,
14 lru);
15 list_del(&page->lru);
16 __free_page(page);
17 }
18 return 0;
19}
20
21static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
22{
23 struct page *page = alloc_page(GFP_NOFS);
24 if (!page)
25 return -ENOMEM;
26 pl->room += PAGE_SIZE;
27 list_add_tail(&page->lru, &pl->head);
28 if (pl->mapped_tail)
29 kunmap(pl->mapped_tail);
30 pl->mapped_tail = kmap(page);
31 return 0;
32}
33
34int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
35{
36 while (pl->room < len) {
37 size_t bit = pl->room;
38 int ret;
39
40 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
41 buf, bit);
42 pl->length += bit;
43 pl->room -= bit;
44 buf += bit;
45 len -= bit;
46 ret = ceph_pagelist_addpage(pl);
47 if (ret)
48 return ret;
49 }
50
51 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
52 pl->length += len;
53 pl->room -= len;
54 return 0;
55}
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
new file mode 100644
index 000000000000..e8a4187e1087
--- /dev/null
+++ b/fs/ceph/pagelist.h
@@ -0,0 +1,54 @@
1#ifndef __FS_CEPH_PAGELIST_H
2#define __FS_CEPH_PAGELIST_H
3
4#include <linux/list.h>
5
6struct ceph_pagelist {
7 struct list_head head;
8 void *mapped_tail;
9 size_t length;
10 size_t room;
11};
12
13static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
14{
15 INIT_LIST_HEAD(&pl->head);
16 pl->mapped_tail = NULL;
17 pl->length = 0;
18 pl->room = 0;
19}
20extern int ceph_pagelist_release(struct ceph_pagelist *pl);
21
22extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
23
24static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
25{
26 __le64 ev = cpu_to_le64(v);
27 return ceph_pagelist_append(pl, &ev, sizeof(ev));
28}
29static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
30{
31 __le32 ev = cpu_to_le32(v);
32 return ceph_pagelist_append(pl, &ev, sizeof(ev));
33}
34static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
35{
36 __le16 ev = cpu_to_le16(v);
37 return ceph_pagelist_append(pl, &ev, sizeof(ev));
38}
39static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
40{
41 return ceph_pagelist_append(pl, &v, 1);
42}
43static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
44 char *s, size_t len)
45{
46 int ret = ceph_pagelist_encode_32(pl, len);
47 if (ret)
48 return ret;
49 if (len)
50 return ceph_pagelist_append(pl, s, len);
51 return 0;
52}
53
54#endif
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
new file mode 100644
index 000000000000..a1fc1d017b58
--- /dev/null
+++ b/fs/ceph/rados.h
@@ -0,0 +1,376 @@
1#ifndef __RADOS_H
2#define __RADOS_H
3
4/*
5 * Data types for the Ceph distributed object storage layer RADOS
6 * (Reliable Autonomic Distributed Object Store).
7 */
8
9#include "msgr.h"
10
11/*
12 * osdmap encoding versions
13 */
14#define CEPH_OSDMAP_INC_VERSION 5
15#define CEPH_OSDMAP_INC_VERSION_EXT 5
16#define CEPH_OSDMAP_VERSION 5
17#define CEPH_OSDMAP_VERSION_EXT 5
18
/*
 * fs id: an opaque 16-byte identifier
 */
struct ceph_fsid {
	unsigned char fsid[16];
};
25
26static inline int ceph_fsid_compare(const struct ceph_fsid *a,
27 const struct ceph_fsid *b)
28{
29 return memcmp(a, b, sizeof(*a));
30}
31
32/*
33 * ino, object, etc.
34 */
35typedef __le64 ceph_snapid_t;
36#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
37#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
38#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
39
40struct ceph_timespec {
41 __le32 tv_sec;
42 __le32 tv_nsec;
43} __attribute__ ((packed));
44
45
46/*
47 * object layout - how objects are mapped into PGs
48 */
49#define CEPH_OBJECT_LAYOUT_HASH 1
50#define CEPH_OBJECT_LAYOUT_LINEAR 2
51#define CEPH_OBJECT_LAYOUT_HASHINO 3
52
53/*
54 * pg layout -- how PGs are mapped onto (sets of) OSDs
55 */
56#define CEPH_PG_LAYOUT_CRUSH 0
57#define CEPH_PG_LAYOUT_HASH 1
58#define CEPH_PG_LAYOUT_LINEAR 2
59#define CEPH_PG_LAYOUT_HYBRID 3
60
61
62/*
63 * placement group.
64 * we encode this into one __le64.
65 */
66struct ceph_pg {
67 __le16 preferred; /* preferred primary osd */
68 __le16 ps; /* placement seed */
69 __le32 pool; /* object pool */
70} __attribute__ ((packed));
71
72/*
73 * pg_pool is a set of pgs storing a pool of objects
74 *
75 * pg_num -- base number of pseudorandomly placed pgs
76 *
77 * pgp_num -- effective number when calculating pg placement. this
78 * is used for pg_num increases. new pgs result in data being "split"
80 * into new pgs. for this to proceed smoothly, new pgs are initially
80 * colocated with their parents; that is, pgp_num doesn't increase
81 * until the new pgs have successfully split. only _then_ are the new
82 * pgs placed independently.
83 *
84 * lpg_num -- localized pg count (per device). replicas are randomly
85 * selected.
86 *
87 * lpgp_num -- as above.
88 */
89#define CEPH_PG_TYPE_REP 1
90#define CEPH_PG_TYPE_RAID4 2
91#define CEPH_PG_POOL_VERSION 2
92struct ceph_pg_pool {
93 __u8 type; /* CEPH_PG_TYPE_* */
94 __u8 size; /* number of osds in each pg */
95 __u8 crush_ruleset; /* crush placement rule */
96 __u8 object_hash; /* hash mapping object name to ps */
97 __le32 pg_num, pgp_num; /* number of pg's */
98 __le32 lpg_num, lpgp_num; /* number of localized pg's */
99 __le32 last_change; /* most recent epoch changed */
100 __le64 snap_seq; /* seq for per-pool snapshot */
101 __le32 snap_epoch; /* epoch of last snap */
102 __le32 num_snaps;
103 __le32 num_removed_snap_intervals;
104 __le64 uid;
105} __attribute__ ((packed));
106
/*
 * stable_mod controls the number of placement groups.  It behaves like
 * a plain modulo, except that the mapping stays stable as b grows over
 * time.  b is the number of bins and bmask is the containing power of
 * 2 minus 1.
 *
 * b <= bmask and bmask=(2**n)-1
 * e.g., b=12 -> bmask=15, b=123 -> bmask=127
 *
 * x is first masked with bmask; if that lands outside [0, b) it is
 * masked with the next smaller mask (bmask >> 1) instead.
 */
static inline int ceph_stable_mod(int x, int b, int bmask)
{
	int bin = x & bmask;

	if (bin >= b)
		bin = x & (bmask >> 1);
	return bin;
}
123
124/*
125 * object layout - how a given object should be stored.
126 */
127struct ceph_object_layout {
128 struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
129 __le32 ol_stripe_unit; /* for per-object parity, if any */
130} __attribute__ ((packed));
131
132/*
133 * compound epoch+version, used by storage layer to serialize mutations
134 */
135struct ceph_eversion {
136 __le32 epoch;
137 __le64 version;
138} __attribute__ ((packed));
139
140/*
141 * osd map bits
142 */
143
144/* status bits */
145#define CEPH_OSD_EXISTS 1
146#define CEPH_OSD_UP 2
147
148/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
149#define CEPH_OSD_IN 0x10000
150#define CEPH_OSD_OUT 0
151
152
153/*
154 * osd map flag bits
155 */
156#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
157#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
158#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
159#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
160#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
161
162/*
163 * osd ops
164 */
165#define CEPH_OSD_OP_MODE 0xf000
166#define CEPH_OSD_OP_MODE_RD 0x1000
167#define CEPH_OSD_OP_MODE_WR 0x2000
168#define CEPH_OSD_OP_MODE_RMW 0x3000
169#define CEPH_OSD_OP_MODE_SUB 0x4000
170
171#define CEPH_OSD_OP_TYPE 0x0f00
172#define CEPH_OSD_OP_TYPE_LOCK 0x0100
173#define CEPH_OSD_OP_TYPE_DATA 0x0200
174#define CEPH_OSD_OP_TYPE_ATTR 0x0300
175#define CEPH_OSD_OP_TYPE_EXEC 0x0400
176#define CEPH_OSD_OP_TYPE_PG 0x0500
177
178enum {
179 /** data **/
180 /* read */
181 CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
182 CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
183
184 /* fancy read */
185 CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
186
187 /* write */
188 CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
189 CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
190 CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
191 CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
192 CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
193
194 /* fancy write */
195 CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
196 CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
197 CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
198 CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
199
200 CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
201 CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
202 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
203
204 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
205
206 /** attrs **/
207 /* read */
208 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
209 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
210
211 /* write */
212 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
213 CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
214 CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
215 CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
216
217 /** subop **/
218 CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
219 CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
220 CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
221 CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
222 CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
223
224 /** lock **/
225 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
226 CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
227 CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
228 CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
229 CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
230 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
231
232 /** exec **/
233 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
234
235 /** pg **/
236 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
237};
238
239static inline int ceph_osd_op_type_lock(int op)
240{
241 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
242}
243static inline int ceph_osd_op_type_data(int op)
244{
245 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
246}
247static inline int ceph_osd_op_type_attr(int op)
248{
249 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
250}
251static inline int ceph_osd_op_type_exec(int op)
252{
253 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
254}
255static inline int ceph_osd_op_type_pg(int op)
256{
257 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
258}
259
260static inline int ceph_osd_op_mode_subop(int op)
261{
262 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
263}
264static inline int ceph_osd_op_mode_read(int op)
265{
266 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
267}
268static inline int ceph_osd_op_mode_modify(int op)
269{
270 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
271}
272
273#define CEPH_OSD_TMAP_HDR 'h'
274#define CEPH_OSD_TMAP_SET 's'
275#define CEPH_OSD_TMAP_RM 'r'
276
277extern const char *ceph_osd_op_name(int op);
278
279
/*
 * osd op flags (one bit each)
 *
 * An op may be READ, WRITE, or READ|WRITE.
 */
enum {
	CEPH_OSD_FLAG_ACK           = 1 << 0,  /* want (or is) "ack" ack */
	CEPH_OSD_FLAG_ONNVRAM       = 1 << 1,  /* want (or is) "onnvram" ack */
	CEPH_OSD_FLAG_ONDISK        = 1 << 2,  /* want (or is) "ondisk" ack */
	CEPH_OSD_FLAG_RETRY         = 1 << 3,  /* resend attempt */
	CEPH_OSD_FLAG_READ          = 1 << 4,  /* op may read */
	CEPH_OSD_FLAG_WRITE         = 1 << 5,  /* op may write */
	CEPH_OSD_FLAG_ORDERSNAP     = 1 << 6,  /* EOLDSNAP if snapc is out of order */
	CEPH_OSD_FLAG_PEERSTAT      = 1 << 7,  /* msg includes osd_peer_stat */
	CEPH_OSD_FLAG_BALANCE_READS = 1 << 8,
	CEPH_OSD_FLAG_PARALLELEXEC  = 1 << 9,  /* execute op in parallel */
	CEPH_OSD_FLAG_PGOP          = 1 << 10, /* pg op, no object */
	CEPH_OSD_FLAG_EXEC          = 1 << 11, /* op may exec */
};
299
/* flag bits for an individual op */
enum {
	CEPH_OSD_OP_FLAG_EXCL = 1 << 0,	/* EXCL object create */
};
303
304#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
305#define EBLACKLISTED ESHUTDOWN /* blacklisted */
306
307/*
308 * an individual object operation. each may be accompanied by some data
309 * payload
310 */
311struct ceph_osd_op {
312 __le16 op; /* CEPH_OSD_OP_* */
313 __le32 flags; /* CEPH_OSD_FLAG_* */
314 union {
315 struct {
316 __le64 offset, length;
317 __le64 truncate_size;
318 __le32 truncate_seq;
319 } __attribute__ ((packed)) extent;
320 struct {
321 __le32 name_len;
322 __le32 value_len;
323 } __attribute__ ((packed)) xattr;
324 struct {
325 __u8 class_len;
326 __u8 method_len;
327 __u8 argc;
328 __le32 indata_len;
329 } __attribute__ ((packed)) cls;
330 struct {
331 __le64 cookie, count;
332 } __attribute__ ((packed)) pgls;
333 };
334 __le32 payload_len;
335} __attribute__ ((packed));
336
337/*
338 * osd request message header. each request may include multiple
339 * ceph_osd_op object operations.
340 */
341struct ceph_osd_request_head {
342 __le32 client_inc; /* client incarnation */
343 struct ceph_object_layout layout; /* pgid */
344 __le32 osdmap_epoch; /* client's osdmap epoch */
345
346 __le32 flags;
347
348 struct ceph_timespec mtime; /* for mutations only */
349 struct ceph_eversion reassert_version; /* if we are replaying op */
350
351 __le32 object_len; /* length of object name */
352
353 __le64 snapid; /* snapid to read */
354 __le64 snap_seq; /* writer's snap context */
355 __le32 num_snaps;
356
357 __le16 num_ops;
358 struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
359} __attribute__ ((packed));
360
361struct ceph_osd_reply_head {
362 __le32 client_inc; /* client incarnation */
363 __le32 flags;
364 struct ceph_object_layout layout;
365 __le32 osdmap_epoch;
366 struct ceph_eversion reassert_version; /* for replaying uncommitted */
367
368 __le32 result; /* result code */
369
370 __le32 object_len; /* length of object name */
371 __le32 num_ops;
372 struct ceph_osd_op ops[0]; /* ops[], object */
373} __attribute__ ((packed));
374
375
376#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
new file mode 100644
index 000000000000..2b881262ef67
--- /dev/null
+++ b/fs/ceph/snap.c
@@ -0,0 +1,907 @@
1#include "ceph_debug.h"
2
3#include <linux/sort.h>
4#include <linux/slab.h>
5
6#include "super.h"
7#include "decode.h"
8
9/*
10 * Snapshots in ceph are driven in large part by cooperation from the
11 * client. In contrast to local file systems or file servers that
12 * implement snapshots at a single point in the system, ceph's
13 * distributed access to storage requires clients to help decide
14 * whether a write logically occurs before or after a recently created
15 * snapshot.
16 *
17 * This provides a perfect instantaneous client-wide snapshot. Between
18 * clients, however, snapshots may appear to be applied at slightly
19 * different points in time, depending on delays in delivering the
20 * snapshot notification.
21 *
22 * Snapshots are _not_ file system-wide. Instead, each snapshot
23 * applies to the subdirectory nested beneath some directory. This
24 * effectively divides the hierarchy into multiple "realms," where all
25 * of the files contained by each realm share the same set of
26 * snapshots. An individual realm's snap set contains snapshots
27 * explicitly created on that realm, as well as any snaps in its
28 * parent's snap set _after_ the point at which the parent became its
29 * parent (due to, say, a rename). Similarly, snaps from prior parents
30 * during the time intervals during which they were the parent are included.
31 *
32 * The client is spared most of this detail, fortunately... it must only
33 * maintain a hierarchy of realms reflecting the current parent/child
34 * realm relationship, and for each realm has an explicit list of snaps
35 * inherited from prior parents.
36 *
37 * A snap_realm struct is maintained for realms containing every inode
38 * with an open cap in the system. (The needed snap realm information is
39 * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq'
40 * version number is used to ensure that as realm parameters change (new
41 * snapshot, new parent, etc.) the client's realm hierarchy is updated.
42 *
43 * The realm hierarchy drives the generation of a 'snap context' for each
44 * realm, which simply lists the resulting set of snaps for the realm. This
45 * is attached to any writes sent to OSDs.
46 */
47/*
48 * Unfortunately error handling is a bit mixed here. If we get a snap
49 * update, but don't have enough memory to update our realm hierarchy,
50 * it's not clear what we can do about it (besides complaining to the
51 * console).
52 */
53
54
55/*
56 * increase ref count for the realm
57 *
58 * caller must hold snap_rwsem for write.
59 */
60void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
61 struct ceph_snap_realm *realm)
62{
63 dout("get_realm %p %d -> %d\n", realm,
64 atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
65 /*
66 * since we _only_ increment realm refs or empty the empty
67 * list with snap_rwsem held, adjusting the empty list here is
68 * safe. we do need to protect against concurrent empty list
69 * additions, however.
70 */
71 if (atomic_read(&realm->nref) == 0) {
72 spin_lock(&mdsc->snap_empty_lock);
73 list_del_init(&realm->empty_item);
74 spin_unlock(&mdsc->snap_empty_lock);
75 }
76
77 atomic_inc(&realm->nref);
78}
79
80static void __insert_snap_realm(struct rb_root *root,
81 struct ceph_snap_realm *new)
82{
83 struct rb_node **p = &root->rb_node;
84 struct rb_node *parent = NULL;
85 struct ceph_snap_realm *r = NULL;
86
87 while (*p) {
88 parent = *p;
89 r = rb_entry(parent, struct ceph_snap_realm, node);
90 if (new->ino < r->ino)
91 p = &(*p)->rb_left;
92 else if (new->ino > r->ino)
93 p = &(*p)->rb_right;
94 else
95 BUG();
96 }
97
98 rb_link_node(&new->node, parent, p);
99 rb_insert_color(&new->node, root);
100}
101
102/*
103 * create and get the realm rooted at @ino and bump its ref count.
104 *
105 * caller must hold snap_rwsem for write.
106 */
107static struct ceph_snap_realm *ceph_create_snap_realm(
108 struct ceph_mds_client *mdsc,
109 u64 ino)
110{
111 struct ceph_snap_realm *realm;
112
113 realm = kzalloc(sizeof(*realm), GFP_NOFS);
114 if (!realm)
115 return ERR_PTR(-ENOMEM);
116
117 atomic_set(&realm->nref, 0); /* tree does not take a ref */
118 realm->ino = ino;
119 INIT_LIST_HEAD(&realm->children);
120 INIT_LIST_HEAD(&realm->child_item);
121 INIT_LIST_HEAD(&realm->empty_item);
122 INIT_LIST_HEAD(&realm->inodes_with_caps);
123 spin_lock_init(&realm->inodes_with_caps_lock);
124 __insert_snap_realm(&mdsc->snap_realms, realm);
125 dout("create_snap_realm %llx %p\n", realm->ino, realm);
126 return realm;
127}
128
129/*
130 * lookup the realm rooted at @ino.
131 *
132 * caller must hold snap_rwsem for write.
133 */
134struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
135 u64 ino)
136{
137 struct rb_node *n = mdsc->snap_realms.rb_node;
138 struct ceph_snap_realm *r;
139
140 while (n) {
141 r = rb_entry(n, struct ceph_snap_realm, node);
142 if (ino < r->ino)
143 n = n->rb_left;
144 else if (ino > r->ino)
145 n = n->rb_right;
146 else {
147 dout("lookup_snap_realm %llx %p\n", r->ino, r);
148 return r;
149 }
150 }
151 return NULL;
152}
153
154static void __put_snap_realm(struct ceph_mds_client *mdsc,
155 struct ceph_snap_realm *realm);
156
157/*
158 * called with snap_rwsem (write)
159 */
160static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
161 struct ceph_snap_realm *realm)
162{
163 dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
164
165 rb_erase(&realm->node, &mdsc->snap_realms);
166
167 if (realm->parent) {
168 list_del_init(&realm->child_item);
169 __put_snap_realm(mdsc, realm->parent);
170 }
171
172 kfree(realm->prior_parent_snaps);
173 kfree(realm->snaps);
174 ceph_put_snap_context(realm->cached_context);
175 kfree(realm);
176}
177
178/*
179 * caller holds snap_rwsem (write)
180 */
181static void __put_snap_realm(struct ceph_mds_client *mdsc,
182 struct ceph_snap_realm *realm)
183{
184 dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
185 atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
186 if (atomic_dec_and_test(&realm->nref))
187 __destroy_snap_realm(mdsc, realm);
188}
189
190/*
191 * caller needn't hold any locks
192 */
193void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
194 struct ceph_snap_realm *realm)
195{
196 dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
197 atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
198 if (!atomic_dec_and_test(&realm->nref))
199 return;
200
201 if (down_write_trylock(&mdsc->snap_rwsem)) {
202 __destroy_snap_realm(mdsc, realm);
203 up_write(&mdsc->snap_rwsem);
204 } else {
205 spin_lock(&mdsc->snap_empty_lock);
206 list_add(&mdsc->snap_empty, &realm->empty_item);
207 spin_unlock(&mdsc->snap_empty_lock);
208 }
209}
210
211/*
212 * Clean up any realms whose ref counts have dropped to zero. Note
213 * that this does not include realms who were created but not yet
214 * used.
215 *
216 * Called under snap_rwsem (write)
217 */
218static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
219{
220 struct ceph_snap_realm *realm;
221
222 spin_lock(&mdsc->snap_empty_lock);
223 while (!list_empty(&mdsc->snap_empty)) {
224 realm = list_first_entry(&mdsc->snap_empty,
225 struct ceph_snap_realm, empty_item);
226 list_del(&realm->empty_item);
227 spin_unlock(&mdsc->snap_empty_lock);
228 __destroy_snap_realm(mdsc, realm);
229 spin_lock(&mdsc->snap_empty_lock);
230 }
231 spin_unlock(&mdsc->snap_empty_lock);
232}
233
234void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
235{
236 down_write(&mdsc->snap_rwsem);
237 __cleanup_empty_realms(mdsc);
238 up_write(&mdsc->snap_rwsem);
239}
240
241/*
242 * adjust the parent realm of a given @realm. adjust child list, and parent
243 * pointers, and ref counts appropriately.
244 *
245 * return true if parent was changed, 0 if unchanged, <0 on error.
246 *
247 * caller must hold snap_rwsem for write.
248 */
249static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
250 struct ceph_snap_realm *realm,
251 u64 parentino)
252{
253 struct ceph_snap_realm *parent;
254
255 if (realm->parent_ino == parentino)
256 return 0;
257
258 parent = ceph_lookup_snap_realm(mdsc, parentino);
259 if (!parent) {
260 parent = ceph_create_snap_realm(mdsc, parentino);
261 if (IS_ERR(parent))
262 return PTR_ERR(parent);
263 }
264 dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
265 realm->ino, realm, realm->parent_ino, realm->parent,
266 parentino, parent);
267 if (realm->parent) {
268 list_del_init(&realm->child_item);
269 ceph_put_snap_realm(mdsc, realm->parent);
270 }
271 realm->parent_ino = parentino;
272 realm->parent = parent;
273 ceph_get_snap_realm(mdsc, parent);
274 list_add(&realm->child_item, &parent->children);
275 return 1;
276}
277
278
279static int cmpu64_rev(const void *a, const void *b)
280{
281 if (*(u64 *)a < *(u64 *)b)
282 return 1;
283 if (*(u64 *)a > *(u64 *)b)
284 return -1;
285 return 0;
286}
287
288/*
289 * build the snap context for a given realm.
290 */
291static int build_snap_context(struct ceph_snap_realm *realm)
292{
293 struct ceph_snap_realm *parent = realm->parent;
294 struct ceph_snap_context *snapc;
295 int err = 0;
296 int i;
297 int num = realm->num_prior_parent_snaps + realm->num_snaps;
298
299 /*
300 * build parent context, if it hasn't been built.
301 * conservatively estimate that all parent snaps might be
302 * included by us.
303 */
304 if (parent) {
305 if (!parent->cached_context) {
306 err = build_snap_context(parent);
307 if (err)
308 goto fail;
309 }
310 num += parent->cached_context->num_snaps;
311 }
312
313 /* do i actually need to update? not if my context seq
314 matches realm seq, and my parents' does to. (this works
315 because we rebuild_snap_realms() works _downward_ in
316 hierarchy after each update.) */
317 if (realm->cached_context &&
318 realm->cached_context->seq == realm->seq &&
319 (!parent ||
320 realm->cached_context->seq >= parent->cached_context->seq)) {
321 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
322 " (unchanged)\n",
323 realm->ino, realm, realm->cached_context,
324 realm->cached_context->seq,
325 realm->cached_context->num_snaps);
326 return 0;
327 }
328
329 /* alloc new snap context */
330 err = -ENOMEM;
331 if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
332 goto fail;
333 snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
334 if (!snapc)
335 goto fail;
336 atomic_set(&snapc->nref, 1);
337
338 /* build (reverse sorted) snap vector */
339 num = 0;
340 snapc->seq = realm->seq;
341 if (parent) {
342 /* include any of parent's snaps occuring _after_ my
343 parent became my parent */
344 for (i = 0; i < parent->cached_context->num_snaps; i++)
345 if (parent->cached_context->snaps[i] >=
346 realm->parent_since)
347 snapc->snaps[num++] =
348 parent->cached_context->snaps[i];
349 if (parent->cached_context->seq > snapc->seq)
350 snapc->seq = parent->cached_context->seq;
351 }
352 memcpy(snapc->snaps + num, realm->snaps,
353 sizeof(u64)*realm->num_snaps);
354 num += realm->num_snaps;
355 memcpy(snapc->snaps + num, realm->prior_parent_snaps,
356 sizeof(u64)*realm->num_prior_parent_snaps);
357 num += realm->num_prior_parent_snaps;
358
359 sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
360 snapc->num_snaps = num;
361 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n",
362 realm->ino, realm, snapc, snapc->seq, snapc->num_snaps);
363
364 if (realm->cached_context)
365 ceph_put_snap_context(realm->cached_context);
366 realm->cached_context = snapc;
367 return 0;
368
369fail:
370 /*
371 * if we fail, clear old (incorrect) cached_context... hopefully
372 * we'll have better luck building it later
373 */
374 if (realm->cached_context) {
375 ceph_put_snap_context(realm->cached_context);
376 realm->cached_context = NULL;
377 }
378 pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
379 realm, err);
380 return err;
381}
382
383/*
384 * rebuild snap context for the given realm and all of its children.
385 */
386static void rebuild_snap_realms(struct ceph_snap_realm *realm)
387{
388 struct ceph_snap_realm *child;
389
390 dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
391 build_snap_context(realm);
392
393 list_for_each_entry(child, &realm->children, child_item)
394 rebuild_snap_realms(child);
395}
396
397
398/*
399 * helper to allocate and decode an array of snapids. free prior
400 * instance, if any.
401 */
402static int dup_array(u64 **dst, __le64 *src, int num)
403{
404 int i;
405
406 kfree(*dst);
407 if (num) {
408 *dst = kcalloc(num, sizeof(u64), GFP_NOFS);
409 if (!*dst)
410 return -ENOMEM;
411 for (i = 0; i < num; i++)
412 (*dst)[i] = get_unaligned_le64(src + i);
413 } else {
414 *dst = NULL;
415 }
416 return 0;
417}
418
419
420/*
421 * When a snapshot is applied, the size/mtime inode metadata is queued
422 * in a ceph_cap_snap (one for each snapshot) until writeback
423 * completes and the metadata can be flushed back to the MDS.
424 *
425 * However, if a (sync) write is currently in-progress when we apply
426 * the snapshot, we have to wait until the write succeeds or fails
427 * (and a final size/mtime is known). In this case the
428 * cap_snap->writing = 1, and is said to be "pending." When the write
429 * finishes, we __ceph_finish_cap_snap().
430 *
431 * Caller must hold snap_rwsem for read (i.e., the realm topology won't
432 * change).
433 */
434void ceph_queue_cap_snap(struct ceph_inode_info *ci)
435{
436 struct inode *inode = &ci->vfs_inode;
437 struct ceph_cap_snap *capsnap;
438 int used;
439
440 capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
441 if (!capsnap) {
442 pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
443 return;
444 }
445
446 spin_lock(&inode->i_lock);
447 used = __ceph_caps_used(ci);
448 if (__ceph_have_pending_cap_snap(ci)) {
449 /* there is no point in queuing multiple "pending" cap_snaps,
450 as no new writes are allowed to start when pending, so any
451 writes in progress now were started before the previous
452 cap_snap. lucky us. */
453 dout("queue_cap_snap %p already pending\n", inode);
454 kfree(capsnap);
455 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
456 struct ceph_snap_context *snapc = ci->i_head_snapc;
457
458 igrab(inode);
459
460 atomic_set(&capsnap->nref, 1);
461 capsnap->ci = ci;
462 INIT_LIST_HEAD(&capsnap->ci_item);
463 INIT_LIST_HEAD(&capsnap->flushing_item);
464
465 capsnap->follows = snapc->seq - 1;
466 capsnap->issued = __ceph_caps_issued(ci, NULL);
467 capsnap->dirty = __ceph_caps_dirty(ci);
468
469 capsnap->mode = inode->i_mode;
470 capsnap->uid = inode->i_uid;
471 capsnap->gid = inode->i_gid;
472
473 /* fixme? */
474 capsnap->xattr_blob = NULL;
475 capsnap->xattr_len = 0;
476
477 /* dirty page count moved from _head to this cap_snap;
478 all subsequent writes page dirties occur _after_ this
479 snapshot. */
480 capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
481 ci->i_wrbuffer_ref_head = 0;
482 capsnap->context = snapc;
483 ci->i_head_snapc = NULL;
484 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
485
486 if (used & CEPH_CAP_FILE_WR) {
487 dout("queue_cap_snap %p cap_snap %p snapc %p"
488 " seq %llu used WR, now pending\n", inode,
489 capsnap, snapc, snapc->seq);
490 capsnap->writing = 1;
491 } else {
492 /* note mtime, size NOW. */
493 __ceph_finish_cap_snap(ci, capsnap);
494 }
495 } else {
496 dout("queue_cap_snap %p nothing dirty|writing\n", inode);
497 kfree(capsnap);
498 }
499
500 spin_unlock(&inode->i_lock);
501}
502
503/*
504 * Finalize the size, mtime for a cap_snap.. that is, settle on final values
505 * to be used for the snapshot, to be flushed back to the mds.
506 *
507 * If capsnap can now be flushed, add to snap_flush list, and return 1.
508 *
509 * Caller must hold i_lock.
510 */
511int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
512 struct ceph_cap_snap *capsnap)
513{
514 struct inode *inode = &ci->vfs_inode;
515 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
516
517 BUG_ON(capsnap->writing);
518 capsnap->size = inode->i_size;
519 capsnap->mtime = inode->i_mtime;
520 capsnap->atime = inode->i_atime;
521 capsnap->ctime = inode->i_ctime;
522 capsnap->time_warp_seq = ci->i_time_warp_seq;
523 if (capsnap->dirty_pages) {
524 dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
525 "still has %d dirty pages\n", inode, capsnap,
526 capsnap->context, capsnap->context->seq,
527 ceph_cap_string(capsnap->dirty), capsnap->size,
528 capsnap->dirty_pages);
529 return 0;
530 }
531 dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
532 inode, capsnap, capsnap->context,
533 capsnap->context->seq, ceph_cap_string(capsnap->dirty),
534 capsnap->size);
535
536 spin_lock(&mdsc->snap_flush_lock);
537 list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
538 spin_unlock(&mdsc->snap_flush_lock);
539 return 1; /* caller may want to ceph_flush_snaps */
540}
541
542
543/*
544 * Parse and apply a snapblob "snap trace" from the MDS. This specifies
545 * the snap realm parameters from a given realm and all of its ancestors,
546 * up to the root.
547 *
548 * Caller must hold snap_rwsem for write.
549 */
550int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
551 void *p, void *e, bool deletion)
552{
553 struct ceph_mds_snap_realm *ri; /* encoded */
554 __le64 *snaps; /* encoded */
555 __le64 *prior_parent_snaps; /* encoded */
556 struct ceph_snap_realm *realm;
557 int invalidate = 0;
558 int err = -ENOMEM;
559
560 dout("update_snap_trace deletion=%d\n", deletion);
561more:
562 ceph_decode_need(&p, e, sizeof(*ri), bad);
563 ri = p;
564 p += sizeof(*ri);
565 ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
566 le32_to_cpu(ri->num_prior_parent_snaps)), bad);
567 snaps = p;
568 p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
569 prior_parent_snaps = p;
570 p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
571
572 realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
573 if (!realm) {
574 realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
575 if (IS_ERR(realm)) {
576 err = PTR_ERR(realm);
577 goto fail;
578 }
579 }
580
581 if (le64_to_cpu(ri->seq) > realm->seq) {
582 dout("update_snap_trace updating %llx %p %lld -> %lld\n",
583 realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
584 /*
585 * if the realm seq has changed, queue a cap_snap for every
586 * inode with open caps. we do this _before_ we update
587 * the realm info so that we prepare for writeback under the
588 * _previous_ snap context.
589 *
590 * ...unless it's a snap deletion!
591 */
592 if (!deletion) {
593 struct ceph_inode_info *ci;
594 struct inode *lastinode = NULL;
595
596 spin_lock(&realm->inodes_with_caps_lock);
597 list_for_each_entry(ci, &realm->inodes_with_caps,
598 i_snap_realm_item) {
599 struct inode *inode = igrab(&ci->vfs_inode);
600 if (!inode)
601 continue;
602 spin_unlock(&realm->inodes_with_caps_lock);
603 if (lastinode)
604 iput(lastinode);
605 lastinode = inode;
606 ceph_queue_cap_snap(ci);
607 spin_lock(&realm->inodes_with_caps_lock);
608 }
609 spin_unlock(&realm->inodes_with_caps_lock);
610 if (lastinode)
611 iput(lastinode);
612 dout("update_snap_trace cap_snaps queued\n");
613 }
614
615 } else {
616 dout("update_snap_trace %llx %p seq %lld unchanged\n",
617 realm->ino, realm, realm->seq);
618 }
619
620 /* ensure the parent is correct */
621 err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
622 if (err < 0)
623 goto fail;
624 invalidate += err;
625
626 if (le64_to_cpu(ri->seq) > realm->seq) {
627 /* update realm parameters, snap lists */
628 realm->seq = le64_to_cpu(ri->seq);
629 realm->created = le64_to_cpu(ri->created);
630 realm->parent_since = le64_to_cpu(ri->parent_since);
631
632 realm->num_snaps = le32_to_cpu(ri->num_snaps);
633 err = dup_array(&realm->snaps, snaps, realm->num_snaps);
634 if (err < 0)
635 goto fail;
636
637 realm->num_prior_parent_snaps =
638 le32_to_cpu(ri->num_prior_parent_snaps);
639 err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
640 realm->num_prior_parent_snaps);
641 if (err < 0)
642 goto fail;
643
644 invalidate = 1;
645 } else if (!realm->cached_context) {
646 invalidate = 1;
647 }
648
649 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
650 realm, invalidate, p, e);
651
652 if (p < e)
653 goto more;
654
655 /* invalidate when we reach the _end_ (root) of the trace */
656 if (invalidate)
657 rebuild_snap_realms(realm);
658
659 __cleanup_empty_realms(mdsc);
660 return 0;
661
662bad:
663 err = -EINVAL;
664fail:
665 pr_err("update_snap_trace error %d\n", err);
666 return err;
667}
668
669
670/*
671 * Send any cap_snaps that are queued for flush. Try to carry
672 * s_mutex across multiple snap flushes to avoid locking overhead.
673 *
674 * Caller holds no locks.
675 */
676static void flush_snaps(struct ceph_mds_client *mdsc)
677{
678 struct ceph_inode_info *ci;
679 struct inode *inode;
680 struct ceph_mds_session *session = NULL;
681
682 dout("flush_snaps\n");
683 spin_lock(&mdsc->snap_flush_lock);
684 while (!list_empty(&mdsc->snap_flush_list)) {
685 ci = list_first_entry(&mdsc->snap_flush_list,
686 struct ceph_inode_info, i_snap_flush_item);
687 inode = &ci->vfs_inode;
688 igrab(inode);
689 spin_unlock(&mdsc->snap_flush_lock);
690 spin_lock(&inode->i_lock);
691 __ceph_flush_snaps(ci, &session);
692 spin_unlock(&inode->i_lock);
693 iput(inode);
694 spin_lock(&mdsc->snap_flush_lock);
695 }
696 spin_unlock(&mdsc->snap_flush_lock);
697
698 if (session) {
699 mutex_unlock(&session->s_mutex);
700 ceph_put_mds_session(session);
701 }
702 dout("flush_snaps done\n");
703}
704
705
706/*
707 * Handle a snap notification from the MDS.
708 *
709 * This can take two basic forms: the simplest is just a snap creation
710 * or deletion notification on an existing realm. This should update the
711 * realm and its children.
712 *
713 * The more difficult case is realm creation, due to snap creation at a
714 * new point in the file hierarchy, or due to a rename that moves a file or
715 * directory into another realm.
716 */
717void ceph_handle_snap(struct ceph_mds_client *mdsc,
718 struct ceph_mds_session *session,
719 struct ceph_msg *msg)
720{
721 struct super_block *sb = mdsc->client->sb;
722 int mds = session->s_mds;
723 u64 split;
724 int op;
725 int trace_len;
726 struct ceph_snap_realm *realm = NULL;
727 void *p = msg->front.iov_base;
728 void *e = p + msg->front.iov_len;
729 struct ceph_mds_snap_head *h;
730 int num_split_inos, num_split_realms;
731 __le64 *split_inos = NULL, *split_realms = NULL;
732 int i;
733 int locked_rwsem = 0;
734
735 /* decode */
736 if (msg->front.iov_len < sizeof(*h))
737 goto bad;
738 h = p;
739 op = le32_to_cpu(h->op);
740 split = le64_to_cpu(h->split); /* non-zero if we are splitting an
741 * existing realm */
742 num_split_inos = le32_to_cpu(h->num_split_inos);
743 num_split_realms = le32_to_cpu(h->num_split_realms);
744 trace_len = le32_to_cpu(h->trace_len);
745 p += sizeof(*h);
746
747 dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
748 ceph_snap_op_name(op), split, trace_len);
749
750 mutex_lock(&session->s_mutex);
751 session->s_seq++;
752 mutex_unlock(&session->s_mutex);
753
754 down_write(&mdsc->snap_rwsem);
755 locked_rwsem = 1;
756
757 if (op == CEPH_SNAP_OP_SPLIT) {
758 struct ceph_mds_snap_realm *ri;
759
760 /*
761 * A "split" breaks part of an existing realm off into
762 * a new realm. The MDS provides a list of inodes
763 * (with caps) and child realms that belong to the new
764 * child.
765 */
766 split_inos = p;
767 p += sizeof(u64) * num_split_inos;
768 split_realms = p;
769 p += sizeof(u64) * num_split_realms;
770 ceph_decode_need(&p, e, sizeof(*ri), bad);
771 /* we will peek at realm info here, but will _not_
772 * advance p, as the realm update will occur below in
773 * ceph_update_snap_trace. */
774 ri = p;
775
776 realm = ceph_lookup_snap_realm(mdsc, split);
777 if (!realm) {
778 realm = ceph_create_snap_realm(mdsc, split);
779 if (IS_ERR(realm))
780 goto out;
781 }
782 ceph_get_snap_realm(mdsc, realm);
783
784 dout("splitting snap_realm %llx %p\n", realm->ino, realm);
785 for (i = 0; i < num_split_inos; i++) {
786 struct ceph_vino vino = {
787 .ino = le64_to_cpu(split_inos[i]),
788 .snap = CEPH_NOSNAP,
789 };
790 struct inode *inode = ceph_find_inode(sb, vino);
791 struct ceph_inode_info *ci;
792
793 if (!inode)
794 continue;
795 ci = ceph_inode(inode);
796
797 spin_lock(&inode->i_lock);
798 if (!ci->i_snap_realm)
799 goto skip_inode;
800 /*
801 * If this inode belongs to a realm that was
802 * created after our new realm, we experienced
803 * a race (due to another split notifications
804 * arriving from a different MDS). So skip
805 * this inode.
806 */
807 if (ci->i_snap_realm->created >
808 le64_to_cpu(ri->created)) {
809 dout(" leaving %p in newer realm %llx %p\n",
810 inode, ci->i_snap_realm->ino,
811 ci->i_snap_realm);
812 goto skip_inode;
813 }
814 dout(" will move %p to split realm %llx %p\n",
815 inode, realm->ino, realm);
816 /*
817 * Remove the inode from the realm's inode
818 * list, but don't add it to the new realm
819 * yet. We don't want the cap_snap to be
820 * queued (again) by ceph_update_snap_trace()
821 * below. Queue it _now_, under the old context.
822 */
823 spin_lock(&realm->inodes_with_caps_lock);
824 list_del_init(&ci->i_snap_realm_item);
825 spin_unlock(&realm->inodes_with_caps_lock);
826 spin_unlock(&inode->i_lock);
827
828 ceph_queue_cap_snap(ci);
829
830 iput(inode);
831 continue;
832
833skip_inode:
834 spin_unlock(&inode->i_lock);
835 iput(inode);
836 }
837
838 /* we may have taken some of the old realm's children. */
839 for (i = 0; i < num_split_realms; i++) {
840 struct ceph_snap_realm *child =
841 ceph_lookup_snap_realm(mdsc,
842 le64_to_cpu(split_realms[i]));
843 if (!child)
844 continue;
845 adjust_snap_realm_parent(mdsc, child, realm->ino);
846 }
847 }
848
849 /*
850 * update using the provided snap trace. if we are deleting a
851 * snap, we can avoid queueing cap_snaps.
852 */
853 ceph_update_snap_trace(mdsc, p, e,
854 op == CEPH_SNAP_OP_DESTROY);
855
856 if (op == CEPH_SNAP_OP_SPLIT) {
857 /*
858 * ok, _now_ add the inodes into the new realm.
859 */
860 for (i = 0; i < num_split_inos; i++) {
861 struct ceph_vino vino = {
862 .ino = le64_to_cpu(split_inos[i]),
863 .snap = CEPH_NOSNAP,
864 };
865 struct inode *inode = ceph_find_inode(sb, vino);
866 struct ceph_inode_info *ci;
867
868 if (!inode)
869 continue;
870 ci = ceph_inode(inode);
871 spin_lock(&inode->i_lock);
872 if (!ci->i_snap_realm)
873 goto split_skip_inode;
874 ceph_put_snap_realm(mdsc, ci->i_snap_realm);
875 spin_lock(&realm->inodes_with_caps_lock);
876 list_add(&ci->i_snap_realm_item,
877 &realm->inodes_with_caps);
878 ci->i_snap_realm = realm;
879 spin_unlock(&realm->inodes_with_caps_lock);
880 ceph_get_snap_realm(mdsc, realm);
881split_skip_inode:
882 spin_unlock(&inode->i_lock);
883 iput(inode);
884 }
885
886 /* we took a reference when we created the realm, above */
887 ceph_put_snap_realm(mdsc, realm);
888 }
889
890 __cleanup_empty_realms(mdsc);
891
892 up_write(&mdsc->snap_rwsem);
893
894 flush_snaps(mdsc);
895 return;
896
897bad:
898 pr_err("corrupt snap message from mds%d\n", mds);
899 ceph_msg_dump(msg);
900out:
901 if (locked_rwsem)
902 up_write(&mdsc->snap_rwsem);
903 return;
904}
905
906
907
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
new file mode 100644
index 000000000000..75d02eaa1279
--- /dev/null
+++ b/fs/ceph/super.c
@@ -0,0 +1,1031 @@
1
2#include "ceph_debug.h"
3
4#include <linux/backing-dev.h>
5#include <linux/fs.h>
6#include <linux/inet.h>
7#include <linux/in6.h>
8#include <linux/module.h>
9#include <linux/mount.h>
10#include <linux/parser.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h>
13#include <linux/seq_file.h>
14#include <linux/slab.h>
15#include <linux/statfs.h>
16#include <linux/string.h>
17#include <linux/version.h>
18#include <linux/vmalloc.h>
19
20#include "decode.h"
21#include "super.h"
22#include "mon_client.h"
23#include "auth.h"
24
25/*
26 * Ceph superblock operations
27 *
28 * Handle the basics of mounting, unmounting.
29 */
30
31
/*
 * Return a pointer to the final component of the path held in the
 * first @len bytes of @s, i.e. whatever follows the last '/'
 * (/foo/bar/baz -> baz).  If there is no '/', @s itself is returned.
 */
const char *ceph_file_part(const char *s, int len)
{
	int pos = len;

	/* walk back from the end until just past a '/' or at the start */
	while (pos != 0 && s[pos - 1] != '/')
		pos--;
	return s + pos;
}
43
44
45/*
46 * super ops
47 */
48static void ceph_put_super(struct super_block *s)
49{
50 struct ceph_client *cl = ceph_client(s);
51
52 dout("put_super\n");
53 ceph_mdsc_close_sessions(&cl->mdsc);
54 return;
55}
56
57static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
58{
59 struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
60 struct ceph_monmap *monmap = client->monc.monmap;
61 struct ceph_statfs st;
62 u64 fsid;
63 int err;
64
65 dout("statfs\n");
66 err = ceph_monc_do_statfs(&client->monc, &st);
67 if (err < 0)
68 return err;
69
70 /* fill in kstatfs */
71 buf->f_type = CEPH_SUPER_MAGIC; /* ?? */
72
73 /*
74 * express utilization in terms of large blocks to avoid
75 * overflow on 32-bit machines.
76 */
77 buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
78 buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
79 buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >>
80 (CEPH_BLOCK_SHIFT-10);
81 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
82
83 buf->f_files = le64_to_cpu(st.num_objects);
84 buf->f_ffree = -1;
85 buf->f_namelen = PATH_MAX;
86 buf->f_frsize = PAGE_CACHE_SIZE;
87
88 /* leave fsid little-endian, regardless of host endianness */
89 fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
90 buf->f_fsid.val[0] = fsid & 0xffffffff;
91 buf->f_fsid.val[1] = fsid >> 32;
92
93 return 0;
94}
95
96
97static int ceph_syncfs(struct super_block *sb, int wait)
98{
99 dout("sync_fs %d\n", wait);
100 ceph_osdc_sync(&ceph_client(sb)->osdc);
101 ceph_mdsc_sync(&ceph_client(sb)->mdsc);
102 dout("sync_fs %d done\n", wait);
103 return 0;
104}
105
106
107/**
108 * ceph_show_options - Show mount options in /proc/mounts
109 * @m: seq_file to write to
110 * @mnt: mount descriptor
111 */
112static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
113{
114 struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
115 struct ceph_mount_args *args = client->mount_args;
116
117 if (args->flags & CEPH_OPT_FSID)
118 seq_printf(m, ",fsidmajor=%llu,fsidminor%llu",
119 le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
120 le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
121 if (args->flags & CEPH_OPT_NOSHARE)
122 seq_puts(m, ",noshare");
123 if (args->flags & CEPH_OPT_DIRSTAT)
124 seq_puts(m, ",dirstat");
125 if ((args->flags & CEPH_OPT_RBYTES) == 0)
126 seq_puts(m, ",norbytes");
127 if (args->flags & CEPH_OPT_NOCRC)
128 seq_puts(m, ",nocrc");
129 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
130 seq_puts(m, ",noasyncreaddir");
131 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
132 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
133 if (args->name)
134 seq_printf(m, ",name=%s", args->name);
135 if (args->secret)
136 seq_puts(m, ",secret=<hidden>");
137 return 0;
138}
139
/*
 * slab caches; created in init_caches(), destroyed in destroy_caches()
 */
struct kmem_cache *ceph_inode_cachep;   /* struct ceph_inode_info */
struct kmem_cache *ceph_cap_cachep;     /* struct ceph_cap */
struct kmem_cache *ceph_dentry_cachep;  /* struct ceph_dentry_info */
struct kmem_cache *ceph_file_cachep;    /* struct ceph_file_info */
147
148static void ceph_inode_init_once(void *foo)
149{
150 struct ceph_inode_info *ci = foo;
151 inode_init_once(&ci->vfs_inode);
152}
153
154static int default_congestion_kb(void)
155{
156 int congestion_kb;
157
158 /*
159 * Copied from NFS
160 *
161 * congestion size, scale with available memory.
162 *
163 * 64MB: 8192k
164 * 128MB: 11585k
165 * 256MB: 16384k
166 * 512MB: 23170k
167 * 1GB: 32768k
168 * 2GB: 46340k
169 * 4GB: 65536k
170 * 8GB: 92681k
171 * 16GB: 131072k
172 *
173 * This allows larger machines to have larger/more transfers.
174 * Limit the default to 256M
175 */
176 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
177 if (congestion_kb > 256*1024)
178 congestion_kb = 256*1024;
179
180 return congestion_kb;
181}
182
183static int __init init_caches(void)
184{
185 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
186 sizeof(struct ceph_inode_info),
187 __alignof__(struct ceph_inode_info),
188 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
189 ceph_inode_init_once);
190 if (ceph_inode_cachep == NULL)
191 return -ENOMEM;
192
193 ceph_cap_cachep = KMEM_CACHE(ceph_cap,
194 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
195 if (ceph_cap_cachep == NULL)
196 goto bad_cap;
197
198 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
199 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
200 if (ceph_dentry_cachep == NULL)
201 goto bad_dentry;
202
203 ceph_file_cachep = KMEM_CACHE(ceph_file_info,
204 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
205 if (ceph_file_cachep == NULL)
206 goto bad_file;
207
208 return 0;
209
210bad_file:
211 kmem_cache_destroy(ceph_dentry_cachep);
212bad_dentry:
213 kmem_cache_destroy(ceph_cap_cachep);
214bad_cap:
215 kmem_cache_destroy(ceph_inode_cachep);
216 return -ENOMEM;
217}
218
219static void destroy_caches(void)
220{
221 kmem_cache_destroy(ceph_inode_cachep);
222 kmem_cache_destroy(ceph_cap_cachep);
223 kmem_cache_destroy(ceph_dentry_cachep);
224 kmem_cache_destroy(ceph_file_cachep);
225}
226
227
228/*
229 * ceph_umount_begin - initiate forced umount. Tear down down the
230 * mount, skipping steps that may hang while waiting for server(s).
231 */
232static void ceph_umount_begin(struct super_block *sb)
233{
234 struct ceph_client *client = ceph_sb_to_client(sb);
235
236 dout("ceph_umount_begin - starting forced umount\n");
237 if (!client)
238 return;
239 client->mount_state = CEPH_MOUNT_SHUTDOWN;
240 return;
241}
242
243static const struct super_operations ceph_super_ops = {
244 .alloc_inode = ceph_alloc_inode,
245 .destroy_inode = ceph_destroy_inode,
246 .write_inode = ceph_write_inode,
247 .sync_fs = ceph_syncfs,
248 .put_super = ceph_put_super,
249 .show_options = ceph_show_options,
250 .statfs = ceph_statfs,
251 .umount_begin = ceph_umount_begin,
252};
253
254
255const char *ceph_msg_type_name(int type)
256{
257 switch (type) {
258 case CEPH_MSG_SHUTDOWN: return "shutdown";
259 case CEPH_MSG_PING: return "ping";
260 case CEPH_MSG_AUTH: return "auth";
261 case CEPH_MSG_AUTH_REPLY: return "auth_reply";
262 case CEPH_MSG_MON_MAP: return "mon_map";
263 case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
264 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
265 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
266 case CEPH_MSG_STATFS: return "statfs";
267 case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
268 case CEPH_MSG_MDS_MAP: return "mds_map";
269 case CEPH_MSG_CLIENT_SESSION: return "client_session";
270 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
271 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
272 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
273 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
274 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
275 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
276 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
277 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
278 case CEPH_MSG_OSD_MAP: return "osd_map";
279 case CEPH_MSG_OSD_OP: return "osd_op";
280 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
281 default: return "unknown";
282 }
283}
284
285
/*
 * mount options
 *
 * The order matters: the option parser classifies a token by comparing
 * it against the Opt_last_int and Opt_last_string markers, so integer
 * options must stay before Opt_last_int and string options between
 * Opt_last_int and Opt_last_string.
 */
enum {
	/* options taking an integer argument */
	Opt_fsidmajor,
	Opt_fsidminor,
	Opt_monport,
	Opt_wsize,
	Opt_rsize,
	Opt_osdtimeout,
	Opt_osdkeepalivetimeout,
	Opt_mount_timeout,
	Opt_osd_idle_ttl,
	Opt_caps_wanted_delay_min,
	Opt_caps_wanted_delay_max,
	Opt_readdir_max_entries,
	Opt_congestion_kb,
	Opt_last_int,		/* marker: int args above */

	/* options taking a string argument */
	Opt_snapdirname,
	Opt_name,
	Opt_secret,
	Opt_last_string,	/* marker: string args above */

	/* everything else */
	Opt_ip,
	Opt_noshare,
	Opt_dirstat,
	Opt_nodirstat,
	Opt_rbytes,
	Opt_norbytes,
	Opt_nocrc,
	Opt_noasyncreaddir,
};
319
320static match_table_t arg_tokens = {
321 {Opt_fsidmajor, "fsidmajor=%ld"},
322 {Opt_fsidminor, "fsidminor=%ld"},
323 {Opt_monport, "monport=%d"},
324 {Opt_wsize, "wsize=%d"},
325 {Opt_rsize, "rsize=%d"},
326 {Opt_osdtimeout, "osdtimeout=%d"},
327 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
328 {Opt_mount_timeout, "mount_timeout=%d"},
329 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
330 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
331 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
332 {Opt_readdir_max_entries, "readdir_max_entries=%d"},
333 {Opt_congestion_kb, "write_congestion_kb=%d"},
334 /* int args above */
335 {Opt_snapdirname, "snapdirname=%s"},
336 {Opt_name, "name=%s"},
337 {Opt_secret, "secret=%s"},
338 /* string args above */
339 {Opt_ip, "ip=%s"},
340 {Opt_noshare, "noshare"},
341 {Opt_dirstat, "dirstat"},
342 {Opt_nodirstat, "nodirstat"},
343 {Opt_rbytes, "rbytes"},
344 {Opt_norbytes, "norbytes"},
345 {Opt_nocrc, "nocrc"},
346 {Opt_noasyncreaddir, "noasyncreaddir"},
347 {-1, NULL}
348};
349
350
351static struct ceph_mount_args *parse_mount_args(int flags, char *options,
352 const char *dev_name,
353 const char **path)
354{
355 struct ceph_mount_args *args;
356 const char *c;
357 int err = -ENOMEM;
358 substring_t argstr[MAX_OPT_ARGS];
359
360 args = kzalloc(sizeof(*args), GFP_KERNEL);
361 if (!args)
362 return ERR_PTR(-ENOMEM);
363 args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
364 GFP_KERNEL);
365 if (!args->mon_addr)
366 goto out;
367
368 dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
369
370 /* start with defaults */
371 args->sb_flags = flags;
372 args->flags = CEPH_OPT_DEFAULT;
373 args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
374 args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
375 args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
376 args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
377 args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
378 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
379 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
380 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
381 args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
382 args->max_readdir = 1024;
383 args->congestion_kb = default_congestion_kb();
384
385 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
386 err = -EINVAL;
387 if (!dev_name)
388 goto out;
389 *path = strstr(dev_name, ":/");
390 if (*path == NULL) {
391 pr_err("device name is missing path (no :/ in %s)\n",
392 dev_name);
393 goto out;
394 }
395
396 /* get mon ip(s) */
397 err = ceph_parse_ips(dev_name, *path, args->mon_addr,
398 CEPH_MAX_MON, &args->num_mon);
399 if (err < 0)
400 goto out;
401
402 /* path on server */
403 *path += 2;
404 dout("server path '%s'\n", *path);
405
406 /* parse mount options */
407 while ((c = strsep(&options, ",")) != NULL) {
408 int token, intval, ret;
409 if (!*c)
410 continue;
411 err = -EINVAL;
412 token = match_token((char *)c, arg_tokens, argstr);
413 if (token < 0) {
414 pr_err("bad mount option at '%s'\n", c);
415 goto out;
416 }
417 if (token < Opt_last_int) {
418 ret = match_int(&argstr[0], &intval);
419 if (ret < 0) {
420 pr_err("bad mount option arg (not int) "
421 "at '%s'\n", c);
422 continue;
423 }
424 dout("got int token %d val %d\n", token, intval);
425 } else if (token > Opt_last_int && token < Opt_last_string) {
426 dout("got string token %d val %s\n", token,
427 argstr[0].from);
428 } else {
429 dout("got token %d\n", token);
430 }
431 switch (token) {
432 case Opt_fsidmajor:
433 *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
434 break;
435 case Opt_fsidminor:
436 *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
437 break;
438 case Opt_ip:
439 err = ceph_parse_ips(argstr[0].from,
440 argstr[0].to,
441 &args->my_addr,
442 1, NULL);
443 if (err < 0)
444 goto out;
445 args->flags |= CEPH_OPT_MYIP;
446 break;
447
448 case Opt_snapdirname:
449 kfree(args->snapdir_name);
450 args->snapdir_name = kstrndup(argstr[0].from,
451 argstr[0].to-argstr[0].from,
452 GFP_KERNEL);
453 break;
454 case Opt_name:
455 args->name = kstrndup(argstr[0].from,
456 argstr[0].to-argstr[0].from,
457 GFP_KERNEL);
458 break;
459 case Opt_secret:
460 args->secret = kstrndup(argstr[0].from,
461 argstr[0].to-argstr[0].from,
462 GFP_KERNEL);
463 break;
464
465 /* misc */
466 case Opt_wsize:
467 args->wsize = intval;
468 break;
469 case Opt_rsize:
470 args->rsize = intval;
471 break;
472 case Opt_osdtimeout:
473 args->osd_timeout = intval;
474 break;
475 case Opt_osdkeepalivetimeout:
476 args->osd_keepalive_timeout = intval;
477 break;
478 case Opt_mount_timeout:
479 args->mount_timeout = intval;
480 break;
481 case Opt_caps_wanted_delay_min:
482 args->caps_wanted_delay_min = intval;
483 break;
484 case Opt_caps_wanted_delay_max:
485 args->caps_wanted_delay_max = intval;
486 break;
487 case Opt_readdir_max_entries:
488 args->max_readdir = intval;
489 break;
490 case Opt_congestion_kb:
491 args->congestion_kb = intval;
492 break;
493
494 case Opt_noshare:
495 args->flags |= CEPH_OPT_NOSHARE;
496 break;
497
498 case Opt_dirstat:
499 args->flags |= CEPH_OPT_DIRSTAT;
500 break;
501 case Opt_nodirstat:
502 args->flags &= ~CEPH_OPT_DIRSTAT;
503 break;
504 case Opt_rbytes:
505 args->flags |= CEPH_OPT_RBYTES;
506 break;
507 case Opt_norbytes:
508 args->flags &= ~CEPH_OPT_RBYTES;
509 break;
510 case Opt_nocrc:
511 args->flags |= CEPH_OPT_NOCRC;
512 break;
513 case Opt_noasyncreaddir:
514 args->flags |= CEPH_OPT_NOASYNCREADDIR;
515 break;
516
517 default:
518 BUG_ON(token);
519 }
520 }
521 return args;
522
523out:
524 kfree(args->mon_addr);
525 kfree(args);
526 return ERR_PTR(err);
527}
528
529static void destroy_mount_args(struct ceph_mount_args *args)
530{
531 dout("destroy_mount_args %p\n", args);
532 kfree(args->snapdir_name);
533 args->snapdir_name = NULL;
534 kfree(args->name);
535 args->name = NULL;
536 kfree(args->secret);
537 args->secret = NULL;
538 kfree(args);
539}
540
541/*
542 * create a fresh client instance
543 */
544static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
545{
546 struct ceph_client *client;
547 int err = -ENOMEM;
548
549 client = kzalloc(sizeof(*client), GFP_KERNEL);
550 if (client == NULL)
551 return ERR_PTR(-ENOMEM);
552
553 mutex_init(&client->mount_mutex);
554
555 init_waitqueue_head(&client->auth_wq);
556
557 client->sb = NULL;
558 client->mount_state = CEPH_MOUNT_MOUNTING;
559 client->mount_args = args;
560
561 client->msgr = NULL;
562
563 client->auth_err = 0;
564 atomic_long_set(&client->writeback_count, 0);
565
566 err = bdi_init(&client->backing_dev_info);
567 if (err < 0)
568 goto fail;
569
570 err = -ENOMEM;
571 client->wb_wq = create_workqueue("ceph-writeback");
572 if (client->wb_wq == NULL)
573 goto fail_bdi;
574 client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
575 if (client->pg_inv_wq == NULL)
576 goto fail_wb_wq;
577 client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
578 if (client->trunc_wq == NULL)
579 goto fail_pg_inv_wq;
580
581 /* set up mempools */
582 err = -ENOMEM;
583 client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
584 client->mount_args->wsize >> PAGE_CACHE_SHIFT);
585 if (!client->wb_pagevec_pool)
586 goto fail_trunc_wq;
587
588 /* caps */
589 client->min_caps = args->max_readdir;
590 ceph_adjust_min_caps(client->min_caps);
591
592 /* subsystems */
593 err = ceph_monc_init(&client->monc, client);
594 if (err < 0)
595 goto fail_mempool;
596 err = ceph_osdc_init(&client->osdc, client);
597 if (err < 0)
598 goto fail_monc;
599 err = ceph_mdsc_init(&client->mdsc, client);
600 if (err < 0)
601 goto fail_osdc;
602 return client;
603
604fail_osdc:
605 ceph_osdc_stop(&client->osdc);
606fail_monc:
607 ceph_monc_stop(&client->monc);
608fail_mempool:
609 mempool_destroy(client->wb_pagevec_pool);
610fail_trunc_wq:
611 destroy_workqueue(client->trunc_wq);
612fail_pg_inv_wq:
613 destroy_workqueue(client->pg_inv_wq);
614fail_wb_wq:
615 destroy_workqueue(client->wb_wq);
616fail_bdi:
617 bdi_destroy(&client->backing_dev_info);
618fail:
619 kfree(client);
620 return ERR_PTR(err);
621}
622
623static void ceph_destroy_client(struct ceph_client *client)
624{
625 dout("destroy_client %p\n", client);
626
627 /* unmount */
628 ceph_mdsc_stop(&client->mdsc);
629 ceph_monc_stop(&client->monc);
630 ceph_osdc_stop(&client->osdc);
631
632 ceph_adjust_min_caps(-client->min_caps);
633
634 ceph_debugfs_client_cleanup(client);
635 destroy_workqueue(client->wb_wq);
636 destroy_workqueue(client->pg_inv_wq);
637 destroy_workqueue(client->trunc_wq);
638
639 if (client->msgr)
640 ceph_messenger_destroy(client->msgr);
641 mempool_destroy(client->wb_pagevec_pool);
642
643 destroy_mount_args(client->mount_args);
644
645 kfree(client);
646 dout("destroy_client %p done\n", client);
647}
648
649/*
650 * Initially learn our fsid, or verify an fsid matches.
651 */
652int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
653{
654 if (client->have_fsid) {
655 if (ceph_fsid_compare(&client->fsid, fsid)) {
656 pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT,
657 PR_FSID(&client->fsid), PR_FSID(fsid));
658 return -1;
659 }
660 } else {
661 pr_info("client%lld fsid " FSID_FORMAT "\n",
662 client->monc.auth->global_id, PR_FSID(fsid));
663 memcpy(&client->fsid, fsid, sizeof(*fsid));
664 ceph_debugfs_client_init(client);
665 client->have_fsid = true;
666 }
667 return 0;
668}
669
670/*
671 * true if we have the mon map (and have thus joined the cluster)
672 */
673static int have_mon_map(struct ceph_client *client)
674{
675 return client->monc.monmap && client->monc.monmap->epoch;
676}
677
678/*
679 * Bootstrap mount by opening the root directory. Note the mount
680 * @started time from caller, and time out if this takes too long.
681 */
682static struct dentry *open_root_dentry(struct ceph_client *client,
683 const char *path,
684 unsigned long started)
685{
686 struct ceph_mds_client *mdsc = &client->mdsc;
687 struct ceph_mds_request *req = NULL;
688 int err;
689 struct dentry *root;
690
691 /* open dir */
692 dout("open_root_inode opening '%s'\n", path);
693 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
694 if (IS_ERR(req))
695 return ERR_PTR(PTR_ERR(req));
696 req->r_path1 = kstrdup(path, GFP_NOFS);
697 req->r_ino1.ino = CEPH_INO_ROOT;
698 req->r_ino1.snap = CEPH_NOSNAP;
699 req->r_started = started;
700 req->r_timeout = client->mount_args->mount_timeout * HZ;
701 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
702 req->r_num_caps = 2;
703 err = ceph_mdsc_do_request(mdsc, NULL, req);
704 if (err == 0) {
705 dout("open_root_inode success\n");
706 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
707 client->sb->s_root == NULL)
708 root = d_alloc_root(req->r_target_inode);
709 else
710 root = d_obtain_alias(req->r_target_inode);
711 req->r_target_inode = NULL;
712 dout("open_root_inode success, root dentry is %p\n", root);
713 } else {
714 root = ERR_PTR(err);
715 }
716 ceph_mdsc_put_request(req);
717 return root;
718}
719
720/*
721 * mount: join the ceph cluster, and open root directory.
722 */
723static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
724 const char *path)
725{
726 struct ceph_entity_addr *myaddr = NULL;
727 int err;
728 unsigned long timeout = client->mount_args->mount_timeout * HZ;
729 unsigned long started = jiffies; /* note the start time */
730 struct dentry *root;
731
732 dout("mount start\n");
733 mutex_lock(&client->mount_mutex);
734
735 /* initialize the messenger */
736 if (client->msgr == NULL) {
737 if (ceph_test_opt(client, MYIP))
738 myaddr = &client->mount_args->my_addr;
739 client->msgr = ceph_messenger_create(myaddr);
740 if (IS_ERR(client->msgr)) {
741 err = PTR_ERR(client->msgr);
742 client->msgr = NULL;
743 goto out;
744 }
745 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
746 }
747
748 /* open session, and wait for mon, mds, and osd maps */
749 err = ceph_monc_open_session(&client->monc);
750 if (err < 0)
751 goto out;
752
753 while (!have_mon_map(client)) {
754 err = -EIO;
755 if (timeout && time_after_eq(jiffies, started + timeout))
756 goto out;
757
758 /* wait */
759 dout("mount waiting for mon_map\n");
760 err = wait_event_interruptible_timeout(client->auth_wq,
761 have_mon_map(client) || (client->auth_err < 0),
762 timeout);
763 if (err == -EINTR || err == -ERESTARTSYS)
764 goto out;
765 if (client->auth_err < 0) {
766 err = client->auth_err;
767 goto out;
768 }
769 }
770
771 dout("mount opening root\n");
772 root = open_root_dentry(client, "", started);
773 if (IS_ERR(root)) {
774 err = PTR_ERR(root);
775 goto out;
776 }
777 if (client->sb->s_root)
778 dput(root);
779 else
780 client->sb->s_root = root;
781
782 if (path[0] == 0) {
783 dget(root);
784 } else {
785 dout("mount opening base mountpoint\n");
786 root = open_root_dentry(client, path, started);
787 if (IS_ERR(root)) {
788 err = PTR_ERR(root);
789 dput(client->sb->s_root);
790 client->sb->s_root = NULL;
791 goto out;
792 }
793 }
794
795 mnt->mnt_root = root;
796 mnt->mnt_sb = client->sb;
797
798 client->mount_state = CEPH_MOUNT_MOUNTED;
799 dout("mount success\n");
800 err = 0;
801
802out:
803 mutex_unlock(&client->mount_mutex);
804 return err;
805}
806
807static int ceph_set_super(struct super_block *s, void *data)
808{
809 struct ceph_client *client = data;
810 int ret;
811
812 dout("set_super %p data %p\n", s, data);
813
814 s->s_flags = client->mount_args->sb_flags;
815 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
816
817 s->s_fs_info = client;
818 client->sb = s;
819
820 s->s_op = &ceph_super_ops;
821 s->s_export_op = &ceph_export_ops;
822
823 s->s_time_gran = 1000; /* 1000 ns == 1 us */
824
825 ret = set_anon_super(s, NULL); /* what is that second arg for? */
826 if (ret != 0)
827 goto fail;
828
829 return ret;
830
831fail:
832 s->s_fs_info = NULL;
833 client->sb = NULL;
834 return ret;
835}
836
837/*
838 * share superblock if same fs AND options
839 */
840static int ceph_compare_super(struct super_block *sb, void *data)
841{
842 struct ceph_client *new = data;
843 struct ceph_mount_args *args = new->mount_args;
844 struct ceph_client *other = ceph_sb_to_client(sb);
845 int i;
846
847 dout("ceph_compare_super %p\n", sb);
848 if (args->flags & CEPH_OPT_FSID) {
849 if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
850 dout("fsid doesn't match\n");
851 return 0;
852 }
853 } else {
854 /* do we share (a) monitor? */
855 for (i = 0; i < new->monc.monmap->num_mon; i++)
856 if (ceph_monmap_contains(other->monc.monmap,
857 &new->monc.monmap->mon_inst[i].addr))
858 break;
859 if (i == new->monc.monmap->num_mon) {
860 dout("mon ip not part of monmap\n");
861 return 0;
862 }
863 dout("mon ip matches existing sb %p\n", sb);
864 }
865 if (args->sb_flags != other->mount_args->sb_flags) {
866 dout("flags differ\n");
867 return 0;
868 }
869 return 1;
870}
871
872/*
873 * construct our own bdi so we can control readahead, etc.
874 */
875static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
876{
877 int err;
878
879 sb->s_bdi = &client->backing_dev_info;
880
881 /* set ra_pages based on rsize mount option? */
882 if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
883 client->backing_dev_info.ra_pages =
884 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
885 >> PAGE_SHIFT;
886 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
887 return err;
888}
889
890static int ceph_get_sb(struct file_system_type *fs_type,
891 int flags, const char *dev_name, void *data,
892 struct vfsmount *mnt)
893{
894 struct super_block *sb;
895 struct ceph_client *client;
896 int err;
897 int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
898 const char *path = NULL;
899 struct ceph_mount_args *args;
900
901 dout("ceph_get_sb\n");
902 args = parse_mount_args(flags, data, dev_name, &path);
903 if (IS_ERR(args)) {
904 err = PTR_ERR(args);
905 goto out_final;
906 }
907
908 /* create client (which we may/may not use) */
909 client = ceph_create_client(args);
910 if (IS_ERR(client)) {
911 err = PTR_ERR(client);
912 goto out_final;
913 }
914
915 if (client->mount_args->flags & CEPH_OPT_NOSHARE)
916 compare_super = NULL;
917 sb = sget(fs_type, compare_super, ceph_set_super, client);
918 if (IS_ERR(sb)) {
919 err = PTR_ERR(sb);
920 goto out;
921 }
922
923 if (ceph_client(sb) != client) {
924 ceph_destroy_client(client);
925 client = ceph_client(sb);
926 dout("get_sb got existing client %p\n", client);
927 } else {
928 dout("get_sb using new client %p\n", client);
929 err = ceph_register_bdi(sb, client);
930 if (err < 0)
931 goto out_splat;
932 }
933
934 err = ceph_mount(client, mnt, path);
935 if (err < 0)
936 goto out_splat;
937 dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
938 mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
939 return 0;
940
941out_splat:
942 ceph_mdsc_close_sessions(&client->mdsc);
943 up_write(&sb->s_umount);
944 deactivate_super(sb);
945 goto out_final;
946
947out:
948 ceph_destroy_client(client);
949out_final:
950 dout("ceph_get_sb fail %d\n", err);
951 return err;
952}
953
954static void ceph_kill_sb(struct super_block *s)
955{
956 struct ceph_client *client = ceph_sb_to_client(s);
957 dout("kill_sb %p\n", s);
958 ceph_mdsc_pre_umount(&client->mdsc);
959 kill_anon_super(s); /* will call put_super after sb is r/o */
960 if (s->s_bdi == &client->backing_dev_info)
961 bdi_unregister(&client->backing_dev_info);
962 bdi_destroy(&client->backing_dev_info);
963 ceph_destroy_client(client);
964}
965
966static struct file_system_type ceph_fs_type = {
967 .owner = THIS_MODULE,
968 .name = "ceph",
969 .get_sb = ceph_get_sb,
970 .kill_sb = ceph_kill_sb,
971 .fs_flags = FS_RENAME_DOES_D_MOVE,
972};
973
/*
 * Two-level stringification: STRINGIFY() expands its argument before
 * turning it into a string literal, _STRINGIFY() does not.
 */
#define _STRINGIFY(tok)	#tok
#define STRINGIFY(tok)	_STRINGIFY(tok)
976
977static int __init init_ceph(void)
978{
979 int ret = 0;
980
981 ret = ceph_debugfs_init();
982 if (ret < 0)
983 goto out;
984
985 ret = ceph_msgr_init();
986 if (ret < 0)
987 goto out_debugfs;
988
989 ret = init_caches();
990 if (ret)
991 goto out_msgr;
992
993 ceph_caps_init();
994
995 ret = register_filesystem(&ceph_fs_type);
996 if (ret)
997 goto out_icache;
998
999 pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n",
1000 CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH,
1001 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL);
1002 return 0;
1003
1004out_icache:
1005 destroy_caches();
1006out_msgr:
1007 ceph_msgr_exit();
1008out_debugfs:
1009 ceph_debugfs_cleanup();
1010out:
1011 return ret;
1012}
1013
1014static void __exit exit_ceph(void)
1015{
1016 dout("exit_ceph\n");
1017 unregister_filesystem(&ceph_fs_type);
1018 ceph_caps_finalize();
1019 destroy_caches();
1020 ceph_msgr_exit();
1021 ceph_debugfs_cleanup();
1022}
1023
1024module_init(init_ceph);
1025module_exit(exit_ceph);
1026
1027MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
1028MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
1029MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
1030MODULE_DESCRIPTION("Ceph filesystem for Linux");
1031MODULE_LICENSE("GPL");
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
new file mode 100644
index 000000000000..e30dfbb056c3
--- /dev/null
+++ b/fs/ceph/super.h
@@ -0,0 +1,901 @@
1#ifndef _FS_CEPH_SUPER_H
2#define _FS_CEPH_SUPER_H
3
4#include "ceph_debug.h"
5
6#include <asm/unaligned.h>
7#include <linux/backing-dev.h>
8#include <linux/completion.h>
9#include <linux/exportfs.h>
10#include <linux/fs.h>
11#include <linux/mempool.h>
12#include <linux/pagemap.h>
13#include <linux/wait.h>
14#include <linux/writeback.h>
15#include <linux/slab.h>
16
17#include "types.h"
18#include "messenger.h"
19#include "msgpool.h"
20#include "mon_client.h"
21#include "mds_client.h"
22#include "osd_client.h"
23#include "ceph_fs.h"
24
/* f_type in struct statfs */
#define CEPH_SUPER_MAGIC	0x00c36400

/*
 * large granularity for statfs utilization stats to facilitate
 * large volume sizes on 32-bit machines.
 */
#define CEPH_BLOCK_SHIFT	20  /* 1 MB */
#define CEPH_BLOCK		(1 << CEPH_BLOCK_SHIFT)
32
33/*
34 * mount options
35 */
/* bit flags stored in ceph_mount_args.flags (bit 3 is not defined here) */
#define CEPH_OPT_FSID             (1<<0)
#define CEPH_OPT_NOSHARE          (1<<1) /* don't share client with other sbs */
#define CEPH_OPT_MYIP             (1<<2) /* specified my ip */
#define CEPH_OPT_DIRSTAT          (1<<4) /* funky `cat dirname` for stats */
#define CEPH_OPT_RBYTES           (1<<5) /* dir st_bytes = rbytes */
#define CEPH_OPT_NOCRC            (1<<6) /* no data crc on writes */
#define CEPH_OPT_NOASYNCREADDIR   (1<<7) /* no dcache readdir */

#define CEPH_OPT_DEFAULT   (CEPH_OPT_RBYTES)

/*
 * Set / test the mount option CEPH_OPT_<opt> on @client.
 *
 * ceph_set_opt() is a plain expression (no trailing semicolon) so that
 * "if (x) ceph_set_opt(c, FOO); else ..." parses as intended.
 * ceph_test_opt() evaluates to 0 or 1.
 */
#define ceph_set_opt(client, opt) \
	((client)->mount_args->flags |= CEPH_OPT_##opt)
#define ceph_test_opt(client, opt) \
	(!!((client)->mount_args->flags & CEPH_OPT_##opt))
50
51
52struct ceph_mount_args {
53 int sb_flags;
54 int num_mon;
55 struct ceph_entity_addr *mon_addr;
56 int flags;
57 int mount_timeout;
58 int osd_idle_ttl;
59 int caps_wanted_delay_min, caps_wanted_delay_max;
60 struct ceph_fsid fsid;
61 struct ceph_entity_addr my_addr;
62 int wsize;
63 int rsize; /* max readahead */
64 int max_readdir; /* max readdir size */
65 int congestion_kb; /* max readdir size */
66 int osd_timeout;
67 int osd_keepalive_timeout;
68 char *snapdir_name; /* default ".snap" */
69 char *name;
70 char *secret;
71 int cap_release_safety;
72};
73
74/*
75 * defaults
76 */
#define CEPH_MOUNT_TIMEOUT_DEFAULT	60
#define CEPH_OSD_TIMEOUT_DEFAULT	60  /* seconds */
#define CEPH_OSD_KEEPALIVE_DEFAULT	5
#define CEPH_OSD_IDLE_TTL_DEFAULT	60
#define CEPH_MOUNT_RSIZE_DEFAULT	(512*1024) /* readahead */

/* upper bounds on message front and data lengths */
#define CEPH_MSG_MAX_FRONT_LEN		(16*1024*1024)
#define CEPH_MSG_MAX_DATA_LEN		(16*1024*1024)

#define CEPH_SNAPDIRNAME_DEFAULT	".snap"
#define CEPH_AUTH_NAME_DEFAULT		"guest"
88
/*
 * Delay telling the MDS we no longer want caps, in case we reopen
 * the file.  Delay a minimum amount of time, even if we send a cap
 * message for some other reason.  Otherwise, take the opportunity to
 * update the mds to avoid sending another message later.
 */
/* bounds on the cap release delay */
#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT	5
#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT	60
97
98
/* mount state (values stored in ceph_client.mount_state) */
enum {
	CEPH_MOUNT_MOUNTING = 0,
	CEPH_MOUNT_MOUNTED,
	CEPH_MOUNT_UNMOUNTING,
	CEPH_MOUNT_UNMOUNTED,
	CEPH_MOUNT_SHUTDOWN,
};
107
/*
 * subtract jiffies: returns a - b.  It is a BUG if @b is after @a.
 */
static inline unsigned long time_sub(unsigned long a, unsigned long b)
{
	long delta;

	BUG_ON(time_after(b, a));
	delta = (long)a - (long)b;
	return delta;
}
116
117/*
118 * per-filesystem client state
119 *
120 * possibly shared by multiple mount points, if they are
121 * mounting the same ceph filesystem/cluster.
122 */
123struct ceph_client {
124 struct ceph_fsid fsid;
125 bool have_fsid;
126
127 struct mutex mount_mutex; /* serialize mount attempts */
128 struct ceph_mount_args *mount_args;
129
130 struct super_block *sb;
131
132 unsigned long mount_state;
133 wait_queue_head_t auth_wq;
134
135 int auth_err;
136
137 int min_caps; /* min caps i added */
138
139 struct ceph_messenger *msgr; /* messenger instance */
140 struct ceph_mon_client monc;
141 struct ceph_mds_client mdsc;
142 struct ceph_osd_client osdc;
143
144 /* writeback */
145 mempool_t *wb_pagevec_pool;
146 struct workqueue_struct *wb_wq;
147 struct workqueue_struct *pg_inv_wq;
148 struct workqueue_struct *trunc_wq;
149 atomic_long_t writeback_count;
150
151 struct backing_dev_info backing_dev_info;
152
153#ifdef CONFIG_DEBUG_FS
154 struct dentry *debugfs_monmap;
155 struct dentry *debugfs_mdsmap, *debugfs_osdmap;
156 struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
157 struct dentry *debugfs_congestion_kb;
158 struct dentry *debugfs_bdi;
159#endif
160};
161
162static inline struct ceph_client *ceph_client(struct super_block *sb)
163{
164 return sb->s_fs_info;
165}
166
167
168/*
169 * File i/o capability. This tracks shared state with the metadata
170 * server that allows us to cache or writeback attributes or to read
171 * and write data. For any given inode, we should have one or more
172 * capabilities, one issued by each metadata server, and our
173 * cumulative access is the OR of all issued capabilities.
174 *
175 * Each cap is referenced by the inode's i_caps rbtree and by per-mds
176 * session capability lists.
177 */
178struct ceph_cap {
179 struct ceph_inode_info *ci;
180 struct rb_node ci_node; /* per-ci cap tree */
181 struct ceph_mds_session *session;
182 struct list_head session_caps; /* per-session caplist */
183 int mds;
184 u64 cap_id; /* unique cap id (mds provided) */
185 int issued; /* latest, from the mds */
186 int implemented; /* implemented superset of issued (for revocation) */
187 int mds_wanted;
188 u32 seq, issue_seq, mseq;
189 u32 cap_gen; /* active/stale cycle */
190 unsigned long last_used;
191 struct list_head caps_item;
192};
193
/* CHECK_CAPS_* flag bits */
#define CHECK_CAPS_NODELAY	1  /* do not delay any further */
#define CHECK_CAPS_AUTHONLY	2  /* only check auth cap */
#define CHECK_CAPS_FLUSH	4  /* flush any dirty caps */
197
198/*
199 * Snapped cap state that is pending flush to mds. When a snapshot occurs,
200 * we first complete any in-process sync writes and writeback any dirty
201 * data before flushing the snapped state (tracked here) back to the MDS.
202 */
203struct ceph_cap_snap {
204 atomic_t nref;
205 struct ceph_inode_info *ci;
206 struct list_head ci_item, flushing_item;
207
208 u64 follows, flush_tid;
209 int issued, dirty;
210 struct ceph_snap_context *context;
211
212 mode_t mode;
213 uid_t uid;
214 gid_t gid;
215
216 void *xattr_blob;
217 int xattr_len;
218 u64 xattr_version;
219
220 u64 size;
221 struct timespec mtime, atime, ctime;
222 u64 time_warp_seq;
223 int writing; /* a sync write is still in progress */
224 int dirty_pages; /* dirty pages awaiting writeback */
225};
226
227static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
228{
229 if (atomic_dec_and_test(&capsnap->nref))
230 kfree(capsnap);
231}
232
233/*
234 * The frag tree describes how a directory is fragmented, potentially across
235 * multiple metadata servers. It is also used to indicate points where
236 * metadata authority is delegated, and whether/where metadata is replicated.
237 *
238 * A _leaf_ frag will be present in the i_fragtree IFF there is
239 * delegation info. That is, if mds >= 0 || ndist > 0.
240 */
241#define CEPH_MAX_DIRFRAG_REP 4
242
243struct ceph_inode_frag {
244 struct rb_node node;
245
246 /* fragtree state */
247 u32 frag;
248 int split_by; /* i.e. 2^(split_by) children */
249
250 /* delegation and replication info */
251 int mds; /* -1 if same authority as parent */
252 int ndist; /* >0 if replicated */
253 int dist[CEPH_MAX_DIRFRAG_REP];
254};
255
256/*
257 * We cache inode xattrs as an encoded blob until they are first used,
258 * at which point we parse them into an rbtree.
259 */
260struct ceph_inode_xattr {
261 struct rb_node node;
262
263 const char *name;
264 int name_len;
265 const char *val;
266 int val_len;
267 int dirty;
268
269 int should_free_name;
270 int should_free_val;
271};
272
273struct ceph_inode_xattrs_info {
274 /*
275 * (still encoded) xattr blob. we avoid the overhead of parsing
276 * this until someone actually calls getxattr, etc.
277 *
278 * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
279 * NULL means we don't know.
280 */
281 struct ceph_buffer *blob, *prealloc_blob;
282
283 struct rb_root index;
284 bool dirty;
285 int count;
286 int names_size;
287 int vals_size;
288 u64 version, index_version;
289};
290
291/*
292 * Ceph inode.
293 */
/* bits for ceph_inode_info.i_ceph_flags (value 2 is not defined here) */
#define CEPH_I_COMPLETE	1   /* we have complete directory cached */
#define CEPH_I_NODELAY	4   /* do not delay cap release */
#define CEPH_I_FLUSH	8   /* do not delay flush of dirty metadata */
#define CEPH_I_NOFLUSH	16  /* do not flush dirty caps */
298
299struct ceph_inode_info {
300 struct ceph_vino i_vino; /* ceph ino + snap */
301
302 u64 i_version;
303 u32 i_time_warp_seq;
304
305 unsigned i_ceph_flags;
306 unsigned long i_release_count;
307
308 struct ceph_file_layout i_layout;
309 char *i_symlink;
310
311 /* for dirs */
312 struct timespec i_rctime;
313 u64 i_rbytes, i_rfiles, i_rsubdirs;
314 u64 i_files, i_subdirs;
315 u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */
316
317 struct rb_root i_fragtree;
318 struct mutex i_fragtree_mutex;
319
320 struct ceph_inode_xattrs_info i_xattrs;
321
322 /* capabilities. protected _both_ by i_lock and cap->session's
323 * s_mutex. */
324 struct rb_root i_caps; /* cap list */
325 struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
326 unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
327 struct list_head i_dirty_item, i_flushing_item;
328 u64 i_cap_flush_seq;
329 /* we need to track cap writeback on a per-cap-bit basis, to allow
330 * overlapping, pipelined cap flushes to the mds. we can probably
331 * reduce the tid to 8 bits if we're concerned about inode size. */
332 u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
333 wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
334 unsigned long i_hold_caps_min; /* jiffies */
335 unsigned long i_hold_caps_max; /* jiffies */
336 struct list_head i_cap_delay_list; /* for delayed cap release to mds */
337 int i_cap_exporting_mds; /* to handle cap migration between */
338 unsigned i_cap_exporting_mseq; /* mds's. */
339 unsigned i_cap_exporting_issued;
340 struct ceph_cap_reservation i_cap_migration_resv;
341 struct list_head i_cap_snaps; /* snapped state pending flush to mds */
342 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */
343 unsigned i_snap_caps; /* cap bits for snapped files */
344
345 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
346
347 u32 i_truncate_seq; /* last truncate to smaller size */
348 u64 i_truncate_size; /* and the size we last truncated down to */
349 int i_truncate_pending; /* still need to call vmtruncate */
350
351 u64 i_max_size; /* max file size authorized by mds */
352 u64 i_reported_size; /* (max_)size reported to or requested of mds */
353 u64 i_wanted_max_size; /* offset we'd like to write too */
354 u64 i_requested_max_size; /* max_size we've requested */
355
356 /* held references to caps */
357 int i_pin_ref;
358 int i_rd_ref, i_rdcache_ref, i_wr_ref;
359 int i_wrbuffer_ref, i_wrbuffer_ref_head;
360 u32 i_shared_gen; /* increment each time we get FILE_SHARED */
361 u32 i_rdcache_gen; /* we increment this each time we get
362 FILE_CACHE. If it's non-zero, we
363 _may_ have cached pages. */
364 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
365
366 struct list_head i_unsafe_writes; /* uncommitted sync writes */
367 struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
368 spinlock_t i_unsafe_lock;
369
370 struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
371 int i_snap_realm_counter; /* snap realm (if caps) */
372 struct list_head i_snap_realm_item;
373 struct list_head i_snap_flush_item;
374
375 struct work_struct i_wb_work; /* writeback work */
376 struct work_struct i_pg_inv_work; /* page invalidation work */
377
378 struct work_struct i_vmtruncate_work;
379
380 struct inode vfs_inode; /* at end */
381};
382
383static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
384{
385 return container_of(inode, struct ceph_inode_info, vfs_inode);
386}
387
388static inline void ceph_i_clear(struct inode *inode, unsigned mask)
389{
390 struct ceph_inode_info *ci = ceph_inode(inode);
391
392 spin_lock(&inode->i_lock);
393 ci->i_ceph_flags &= ~mask;
394 spin_unlock(&inode->i_lock);
395}
396
397static inline void ceph_i_set(struct inode *inode, unsigned mask)
398{
399 struct ceph_inode_info *ci = ceph_inode(inode);
400
401 spin_lock(&inode->i_lock);
402 ci->i_ceph_flags |= mask;
403 spin_unlock(&inode->i_lock);
404}
405
406static inline bool ceph_i_test(struct inode *inode, unsigned mask)
407{
408 struct ceph_inode_info *ci = ceph_inode(inode);
409 bool r;
410
411 smp_mb();
412 r = (ci->i_ceph_flags & mask) == mask;
413 return r;
414}
415
416
417/* find a specific frag @f */
418extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
419 u32 f);
420
421/*
422 * choose fragment for value @v. copy frag content to pfrag, if leaf
423 * exists
424 */
425extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
426 struct ceph_inode_frag *pfrag,
427 int *found);
428
429/*
430 * Ceph dentry state
431 */
432struct ceph_dentry_info {
433 struct ceph_mds_session *lease_session;
434 u32 lease_gen, lease_shared_gen;
435 u32 lease_seq;
436 unsigned long lease_renew_after, lease_renew_from;
437 struct list_head lru;
438 struct dentry *dentry;
439 u64 time;
440 u64 offset;
441};
442
443static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
444{
445 return (struct ceph_dentry_info *)dentry->d_fsdata;
446}
447
/* build a file position with @frag in the upper 32 bits and @off in the lower */
static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
{
	loff_t hi = (loff_t)frag << 32;
	loff_t lo = (loff_t)off;

	return hi | lo;
}
452
453/*
454 * ino_t is <64 bits on many architectures, blech.
455 *
456 * don't include snap in ino hash, at least for now.
457 */
458static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
459{
460 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
461#if BITS_PER_LONG == 32
462 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
463 if (!ino)
464 ino = 1;
465#endif
466 return ino;
467}
468
469static inline int ceph_set_ino_cb(struct inode *inode, void *data)
470{
471 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
472 inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
473 return 0;
474}
475
476static inline struct ceph_vino ceph_vino(struct inode *inode)
477{
478 return ceph_inode(inode)->i_vino;
479}
480
/*
 * for printf-style formatting: expands to TWO arguments (ino, snap);
 * @i is evaluated twice.
 */
#define ceph_vinop(i) \
	ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
483
484static inline u64 ceph_ino(struct inode *inode)
485{
486 return ceph_inode(inode)->i_vino.ino;
487}
488static inline u64 ceph_snap(struct inode *inode)
489{
490 return ceph_inode(inode)->i_vino.snap;
491}
492
493static inline int ceph_ino_compare(struct inode *inode, void *data)
494{
495 struct ceph_vino *pvino = (struct ceph_vino *)data;
496 struct ceph_inode_info *ci = ceph_inode(inode);
497 return ci->i_vino.ino == pvino->ino &&
498 ci->i_vino.snap == pvino->snap;
499}
500
501static inline struct inode *ceph_find_inode(struct super_block *sb,
502 struct ceph_vino vino)
503{
504 ino_t t = ceph_vino_to_ino(vino);
505 return ilookup5(sb, t, ceph_ino_compare, &vino);
506}
507
508
509/*
510 * caps helpers
511 */
512static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
513{
514 return !RB_EMPTY_ROOT(&ci->i_caps);
515}
516
517extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
518extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
519extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
520 struct ceph_cap *cap);
521
522static inline int ceph_caps_issued(struct ceph_inode_info *ci)
523{
524 int issued;
525 spin_lock(&ci->vfs_inode.i_lock);
526 issued = __ceph_caps_issued(ci, NULL);
527 spin_unlock(&ci->vfs_inode.i_lock);
528 return issued;
529}
530
531static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
532 int touch)
533{
534 int r;
535 spin_lock(&ci->vfs_inode.i_lock);
536 r = __ceph_caps_issued_mask(ci, mask, touch);
537 spin_unlock(&ci->vfs_inode.i_lock);
538 return r;
539}
540
541static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
542{
543 return ci->i_dirty_caps | ci->i_flushing_caps;
544}
545extern void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
546
547extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
548extern int __ceph_caps_used(struct ceph_inode_info *ci);
549
550extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
551
552/*
553 * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
554 */
555static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
556{
557 int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
558 if (w & CEPH_CAP_FILE_BUFFER)
559 w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
560 return w;
561}
562
563/* what the mds thinks we want */
564extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
565
566extern void ceph_caps_init(void);
567extern void ceph_caps_finalize(void);
568extern void ceph_adjust_min_caps(int delta);
569extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need);
570extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx);
571extern void ceph_reservation_status(struct ceph_client *client,
572 int *total, int *avail, int *used,
573 int *reserved, int *min);
574
575static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
576{
577 return (struct ceph_client *)inode->i_sb->s_fs_info;
578}
579
580static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
581{
582 return (struct ceph_client *)sb->s_fs_info;
583}
584
585
586/*
587 * we keep buffered readdir results attached to file->private_data
588 */
589struct ceph_file_info {
590 int fmode; /* initialized on open */
591
592 /* readdir: position within the dir */
593 u32 frag;
594 struct ceph_mds_request *last_readdir;
595 int at_end;
596
597 /* readdir: position within a frag */
598 unsigned offset; /* offset of last chunk, adjusted for . and .. */
599 u64 next_offset; /* offset of next chunk (last_name's + 1) */
600 char *last_name; /* last entry in previous chunk */
601 struct dentry *dentry; /* next dentry (for dcache readdir) */
602 unsigned long dir_release_count;
603
604 /* used for -o dirstat read() on directory thing */
605 char *dir_info;
606 int dir_info_len;
607};
608
609
610
611/*
612 * snapshots
613 */
614
615/*
616 * A "snap context" is the set of existing snapshots when we
617 * write data. It is used by the OSD to guide its COW behavior.
618 *
619 * The ceph_snap_context is refcounted, and attached to each dirty
620 * page, indicating which context the dirty data belonged when it was
621 * dirtied.
622 */
623struct ceph_snap_context {
624 atomic_t nref;
625 u64 seq;
626 int num_snaps;
627 u64 snaps[];
628};
629
630static inline struct ceph_snap_context *
631ceph_get_snap_context(struct ceph_snap_context *sc)
632{
633 /*
634 printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
635 atomic_read(&sc->nref)+1);
636 */
637 if (sc)
638 atomic_inc(&sc->nref);
639 return sc;
640}
641
642static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
643{
644 if (!sc)
645 return;
646 /*
647 printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
648 atomic_read(&sc->nref)-1);
649 */
650 if (atomic_dec_and_test(&sc->nref)) {
651 /*printk(" deleting snap_context %p\n", sc);*/
652 kfree(sc);
653 }
654}
655
656/*
657 * A "snap realm" describes a subset of the file hierarchy sharing
658 * the same set of snapshots that apply to it. The realms themselves
659 * are organized into a hierarchy, such that children inherit (some of)
660 * the snapshots of their parents.
661 *
662 * All inodes within the realm that have capabilities are linked into a
663 * per-realm list.
664 */
665struct ceph_snap_realm {
666 u64 ino;
667 atomic_t nref;
668 struct rb_node node;
669
670 u64 created, seq;
671 u64 parent_ino;
672 u64 parent_since; /* snapid when our current parent became so */
673
674 u64 *prior_parent_snaps; /* snaps inherited from any parents we */
675 int num_prior_parent_snaps; /* had prior to parent_since */
676 u64 *snaps; /* snaps specific to this realm */
677 int num_snaps;
678
679 struct ceph_snap_realm *parent;
680 struct list_head children; /* list of child realms */
681 struct list_head child_item;
682
683 struct list_head empty_item; /* if i have ref==0 */
684
685 /* the current set of snaps for this realm */
686 struct ceph_snap_context *cached_context;
687
688 struct list_head inodes_with_caps;
689 spinlock_t inodes_with_caps_lock;
690};
691
692
693
694/*
695 * calculate the number of pages a given length and offset map onto,
696 * if we align the data.
697 */
698static inline int calc_pages_for(u64 off, u64 len)
699{
700 return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
701 (off >> PAGE_CACHE_SHIFT);
702}
703
704
705
706/* snap.c */
707struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
708 u64 ino);
709extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
710 struct ceph_snap_realm *realm);
711extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
712 struct ceph_snap_realm *realm);
713extern int ceph_update_snap_trace(struct ceph_mds_client *m,
714 void *p, void *e, bool deletion);
715extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
716 struct ceph_mds_session *session,
717 struct ceph_msg *msg);
718extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
719extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
720 struct ceph_cap_snap *capsnap);
721extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
722
723/*
724 * a cap_snap is "pending" if it is still awaiting an in-progress
725 * sync write (that may/may not still update size, mtime, etc.).
726 */
727static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
728{
729 return !list_empty(&ci->i_cap_snaps) &&
730 list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
731 ci_item)->writing;
732}
733
734
735/* super.c */
736extern struct kmem_cache *ceph_inode_cachep;
737extern struct kmem_cache *ceph_cap_cachep;
738extern struct kmem_cache *ceph_dentry_cachep;
739extern struct kmem_cache *ceph_file_cachep;
740
741extern const char *ceph_msg_type_name(int type);
742extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
743
744#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
745 "%02x%02x%02x%02x%02x%02x"
746#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
747 (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \
748 (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \
749 (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
750
751/* inode.c */
752extern const struct inode_operations ceph_file_iops;
753
754extern struct inode *ceph_alloc_inode(struct super_block *sb);
755extern void ceph_destroy_inode(struct inode *inode);
756
757extern struct inode *ceph_get_inode(struct super_block *sb,
758 struct ceph_vino vino);
759extern struct inode *ceph_get_snapdir(struct inode *parent);
760extern int ceph_fill_file_size(struct inode *inode, int issued,
761 u32 truncate_seq, u64 truncate_size, u64 size);
762extern void ceph_fill_file_time(struct inode *inode, int issued,
763 u64 time_warp_seq, struct timespec *ctime,
764 struct timespec *mtime, struct timespec *atime);
765extern int ceph_fill_trace(struct super_block *sb,
766 struct ceph_mds_request *req,
767 struct ceph_mds_session *session);
768extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
769 struct ceph_mds_session *session);
770
771extern int ceph_inode_holds_cap(struct inode *inode, int mask);
772
773extern int ceph_inode_set_size(struct inode *inode, loff_t size);
774extern void __ceph_do_pending_vmtruncate(struct inode *inode);
775extern void ceph_queue_vmtruncate(struct inode *inode);
776
777extern void ceph_queue_invalidate(struct inode *inode);
778extern void ceph_queue_writeback(struct inode *inode);
779
780extern int ceph_do_getattr(struct inode *inode, int mask);
781extern int ceph_permission(struct inode *inode, int mask);
782extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
783extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
784 struct kstat *stat);
785
786/* xattr.c */
787extern int ceph_setxattr(struct dentry *, const char *, const void *,
788 size_t, int);
789extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
790extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
791extern int ceph_removexattr(struct dentry *, const char *);
792extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
793extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
794
795/* caps.c */
796extern const char *ceph_cap_string(int c);
797extern void ceph_handle_caps(struct ceph_mds_session *session,
798 struct ceph_msg *msg);
799extern int ceph_add_cap(struct inode *inode,
800 struct ceph_mds_session *session, u64 cap_id,
801 int fmode, unsigned issued, unsigned wanted,
802 unsigned cap, unsigned seq, u64 realmino, int flags,
803 struct ceph_cap_reservation *caps_reservation);
804extern void __ceph_remove_cap(struct ceph_cap *cap);
805static inline void ceph_remove_cap(struct ceph_cap *cap)
806{
807 struct inode *inode = &cap->ci->vfs_inode;
808 spin_lock(&inode->i_lock);
809 __ceph_remove_cap(cap);
810 spin_unlock(&inode->i_lock);
811}
812extern void ceph_put_cap(struct ceph_cap *cap);
813
814extern void ceph_queue_caps_release(struct inode *inode);
815extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
816extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync);
817extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
818 struct ceph_mds_session *session);
819extern int ceph_get_cap_mds(struct inode *inode);
820extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
821extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
822extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
823 struct ceph_snap_context *snapc);
824extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
825 struct ceph_mds_session **psession);
826extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
827 struct ceph_mds_session *session);
828extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
829extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
830
831extern int ceph_encode_inode_release(void **p, struct inode *inode,
832 int mds, int drop, int unless, int force);
833extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
834 int mds, int drop, int unless);
835
836extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
837 int *got, loff_t endoff);
838
839/* for counting open files by mode */
840static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
841{
842 ci->i_nr_by_mode[mode]++;
843}
844extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
845
846/* addr.c */
847extern const struct address_space_operations ceph_aops;
848extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
849
/* file.c */
extern const struct file_operations ceph_file_fops;
/* ceph_aops is declared once, in the addr.c section of this header */
extern int ceph_open(struct inode *inode, struct file *file);
extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
				       struct nameidata *nd, int mode,
				       int locked_dir);
extern int ceph_release(struct inode *inode, struct file *filp);
extern void ceph_release_page_vector(struct page **pages, int num_pages);
859
860/* dir.c */
861extern const struct file_operations ceph_dir_fops;
862extern const struct inode_operations ceph_dir_iops;
863extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
864 ceph_snapdir_dentry_ops;
865
866extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
867extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
868 struct dentry *dentry, int err);
869
870extern void ceph_dentry_lru_add(struct dentry *dn);
871extern void ceph_dentry_lru_touch(struct dentry *dn);
872extern void ceph_dentry_lru_del(struct dentry *dn);
873
874/*
875 * our d_ops vary depending on whether the inode is live,
876 * snapshotted (read-only), or a virtual ".snap" directory.
877 */
878int ceph_init_dentry(struct dentry *dentry);
879
880
881/* ioctl.c */
882extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
883
884/* export.c */
885extern const struct export_operations ceph_export_ops;
886
887/* debugfs.c */
888extern int ceph_debugfs_init(void);
889extern void ceph_debugfs_cleanup(void);
890extern int ceph_debugfs_client_init(struct ceph_client *client);
891extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
892
893static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
894{
895 if (dentry && dentry->d_parent)
896 return dentry->d_parent->d_inode;
897
898 return NULL;
899}
900
901#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
new file mode 100644
index 000000000000..28b35a005ec2
--- /dev/null
+++ b/fs/ceph/types.h
@@ -0,0 +1,29 @@
1#ifndef _FS_CEPH_TYPES_H
2#define _FS_CEPH_TYPES_H
3
4/* needed before including ceph_fs.h */
5#include <linux/in.h>
6#include <linux/types.h>
7#include <linux/fcntl.h>
8#include <linux/string.h>
9
10#include "ceph_fs.h"
11#include "ceph_frag.h"
12#include "ceph_hash.h"
13
14/*
15 * Identify inodes by both their ino AND snapshot id (a u64).
16 */
17struct ceph_vino {
18 u64 ino;
19 u64 snap;
20};
21
22
/* Context used by the caps reservation mechanism. */
struct ceph_cap_reservation {
	int count;
};
27
28
29#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
new file mode 100644
index 000000000000..2845422907fc
--- /dev/null
+++ b/fs/ceph/xattr.c
@@ -0,0 +1,845 @@
1#include "ceph_debug.h"
2#include "super.h"
3#include "decode.h"
4
5#include <linux/xattr.h>
6#include <linux/slab.h>
7
8static bool ceph_is_valid_xattr(const char *name)
9{
10 return !strncmp(name, XATTR_SECURITY_PREFIX,
11 XATTR_SECURITY_PREFIX_LEN) ||
12 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
13 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
14}
15
/*
 * Descriptor for one virtual xattr.  Virtual xattrs expose the
 * recursive directory statistics and layout metadata.
 */
struct ceph_vxattr_cb {
	bool readonly;
	char *name;	/* xattr name; a NULL name terminates a table */
	/* formats the value into @val, given @size bytes of room */
	size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
			      size_t size);
};
26
27/* directories */
28
29static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
30 size_t size)
31{
32 return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
33}
34
35static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
36 size_t size)
37{
38 return snprintf(val, size, "%lld", ci->i_files);
39}
40
41static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
42 size_t size)
43{
44 return snprintf(val, size, "%lld", ci->i_subdirs);
45}
46
47static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
48 size_t size)
49{
50 return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
51}
52
53static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
54 size_t size)
55{
56 return snprintf(val, size, "%lld", ci->i_rfiles);
57}
58
59static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
60 size_t size)
61{
62 return snprintf(val, size, "%lld", ci->i_rsubdirs);
63}
64
65static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
66 size_t size)
67{
68 return snprintf(val, size, "%lld", ci->i_rbytes);
69}
70
71static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
72 size_t size)
73{
74 return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
75 (long)ci->i_rctime.tv_nsec);
76}
77
78static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
79 { true, "user.ceph.dir.entries", ceph_vxattrcb_entries},
80 { true, "user.ceph.dir.files", ceph_vxattrcb_files},
81 { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs},
82 { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries},
83 { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles},
84 { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
85 { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes},
86 { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime},
87 { true, NULL, NULL }
88};
89
90/* files */
91
92static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
93 size_t size)
94{
95 int ret;
96
97 ret = snprintf(val, size,
98 "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
99 (unsigned long long)ceph_file_layout_su(ci->i_layout),
100 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
101 (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
102 if (ceph_file_layout_pg_preferred(ci->i_layout))
103 ret += snprintf(val + ret, size, "preferred_osd=%lld\n",
104 (unsigned long long)ceph_file_layout_pg_preferred(
105 ci->i_layout));
106 return ret;
107}
108
109static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
110 { true, "user.ceph.layout", ceph_vxattrcb_layout},
111 { NULL, NULL }
112};
113
114static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
115{
116 if (S_ISDIR(inode->i_mode))
117 return ceph_dir_vxattrs;
118 else if (S_ISREG(inode->i_mode))
119 return ceph_file_vxattrs;
120 return NULL;
121}
122
123static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
124 const char *name)
125{
126 do {
127 if (strcmp(vxattr->name, name) == 0)
128 return vxattr;
129 vxattr++;
130 } while (vxattr->name);
131 return NULL;
132}
133
134static int __set_xattr(struct ceph_inode_info *ci,
135 const char *name, int name_len,
136 const char *val, int val_len,
137 int dirty,
138 int should_free_name, int should_free_val,
139 struct ceph_inode_xattr **newxattr)
140{
141 struct rb_node **p;
142 struct rb_node *parent = NULL;
143 struct ceph_inode_xattr *xattr = NULL;
144 int c;
145 int new = 0;
146
147 p = &ci->i_xattrs.index.rb_node;
148 while (*p) {
149 parent = *p;
150 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
151 c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
152 if (c < 0)
153 p = &(*p)->rb_left;
154 else if (c > 0)
155 p = &(*p)->rb_right;
156 else {
157 if (name_len == xattr->name_len)
158 break;
159 else if (name_len < xattr->name_len)
160 p = &(*p)->rb_left;
161 else
162 p = &(*p)->rb_right;
163 }
164 xattr = NULL;
165 }
166
167 if (!xattr) {
168 new = 1;
169 xattr = *newxattr;
170 xattr->name = name;
171 xattr->name_len = name_len;
172 xattr->should_free_name = should_free_name;
173
174 ci->i_xattrs.count++;
175 dout("__set_xattr count=%d\n", ci->i_xattrs.count);
176 } else {
177 kfree(*newxattr);
178 *newxattr = NULL;
179 if (xattr->should_free_val)
180 kfree((void *)xattr->val);
181
182 if (should_free_name) {
183 kfree((void *)name);
184 name = xattr->name;
185 }
186 ci->i_xattrs.names_size -= xattr->name_len;
187 ci->i_xattrs.vals_size -= xattr->val_len;
188 }
189 if (!xattr) {
190 pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
191 &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
192 xattr->val);
193 return -ENOMEM;
194 }
195 ci->i_xattrs.names_size += name_len;
196 ci->i_xattrs.vals_size += val_len;
197 if (val)
198 xattr->val = val;
199 else
200 xattr->val = "";
201
202 xattr->val_len = val_len;
203 xattr->dirty = dirty;
204 xattr->should_free_val = (val && should_free_val);
205
206 if (new) {
207 rb_link_node(&xattr->node, parent, p);
208 rb_insert_color(&xattr->node, &ci->i_xattrs.index);
209 dout("__set_xattr_val p=%p\n", p);
210 }
211
212 dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
213 ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
214
215 return 0;
216}
217
218static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
219 const char *name)
220{
221 struct rb_node **p;
222 struct rb_node *parent = NULL;
223 struct ceph_inode_xattr *xattr = NULL;
224 int c;
225
226 p = &ci->i_xattrs.index.rb_node;
227 while (*p) {
228 parent = *p;
229 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
230 c = strncmp(name, xattr->name, xattr->name_len);
231 if (c < 0)
232 p = &(*p)->rb_left;
233 else if (c > 0)
234 p = &(*p)->rb_right;
235 else {
236 dout("__get_xattr %s: found %.*s\n", name,
237 xattr->val_len, xattr->val);
238 return xattr;
239 }
240 }
241
242 dout("__get_xattr %s: not found\n", name);
243
244 return NULL;
245}
246
247static void __free_xattr(struct ceph_inode_xattr *xattr)
248{
249 BUG_ON(!xattr);
250
251 if (xattr->should_free_name)
252 kfree((void *)xattr->name);
253 if (xattr->should_free_val)
254 kfree((void *)xattr->val);
255
256 kfree(xattr);
257}
258
259static int __remove_xattr(struct ceph_inode_info *ci,
260 struct ceph_inode_xattr *xattr)
261{
262 if (!xattr)
263 return -EOPNOTSUPP;
264
265 rb_erase(&xattr->node, &ci->i_xattrs.index);
266
267 if (xattr->should_free_name)
268 kfree((void *)xattr->name);
269 if (xattr->should_free_val)
270 kfree((void *)xattr->val);
271
272 ci->i_xattrs.names_size -= xattr->name_len;
273 ci->i_xattrs.vals_size -= xattr->val_len;
274 ci->i_xattrs.count--;
275 kfree(xattr);
276
277 return 0;
278}
279
/*
 * Look up @name in the inode's index and remove it.  Returns whatever
 * __remove_xattr() returns, including its error for a name that is not
 * present.
 */
static int __remove_xattr_by_name(struct ceph_inode_info *ci,
				  const char *name)
{
	return __remove_xattr(ci, __get_xattr(ci, name));
}
292
293static char *__copy_xattr_names(struct ceph_inode_info *ci,
294 char *dest)
295{
296 struct rb_node *p;
297 struct ceph_inode_xattr *xattr = NULL;
298
299 p = rb_first(&ci->i_xattrs.index);
300 dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
301
302 while (p) {
303 xattr = rb_entry(p, struct ceph_inode_xattr, node);
304 memcpy(dest, xattr->name, xattr->name_len);
305 dest[xattr->name_len] = '\0';
306
307 dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
308 xattr->name_len, ci->i_xattrs.names_size);
309
310 dest += xattr->name_len + 1;
311 p = rb_next(p);
312 }
313
314 return dest;
315}
316
317void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
318{
319 struct rb_node *p, *tmp;
320 struct ceph_inode_xattr *xattr = NULL;
321
322 p = rb_first(&ci->i_xattrs.index);
323
324 dout("__ceph_destroy_xattrs p=%p\n", p);
325
326 while (p) {
327 xattr = rb_entry(p, struct ceph_inode_xattr, node);
328 tmp = p;
329 p = rb_next(tmp);
330 dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
331 xattr->name_len, xattr->name);
332 rb_erase(tmp, &ci->i_xattrs.index);
333
334 __free_xattr(xattr);
335 }
336
337 ci->i_xattrs.names_size = 0;
338 ci->i_xattrs.vals_size = 0;
339 ci->i_xattrs.index_version = 0;
340 ci->i_xattrs.count = 0;
341 ci->i_xattrs.index = RB_ROOT;
342}
343
344static int __build_xattrs(struct inode *inode)
345{
346 u32 namelen;
347 u32 numattr = 0;
348 void *p, *end;
349 u32 len;
350 const char *name, *val;
351 struct ceph_inode_info *ci = ceph_inode(inode);
352 int xattr_version;
353 struct ceph_inode_xattr **xattrs = NULL;
354 int err = 0;
355 int i;
356
357 dout("__build_xattrs() len=%d\n",
358 ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
359
360 if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
361 return 0; /* already built */
362
363 __ceph_destroy_xattrs(ci);
364
365start:
366 /* updated internal xattr rb tree */
367 if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) {
368 p = ci->i_xattrs.blob->vec.iov_base;
369 end = p + ci->i_xattrs.blob->vec.iov_len;
370 ceph_decode_32_safe(&p, end, numattr, bad);
371 xattr_version = ci->i_xattrs.version;
372 spin_unlock(&inode->i_lock);
373
374 xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *),
375 GFP_NOFS);
376 err = -ENOMEM;
377 if (!xattrs)
378 goto bad_lock;
379 memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *));
380 for (i = 0; i < numattr; i++) {
381 xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
382 GFP_NOFS);
383 if (!xattrs[i])
384 goto bad_lock;
385 }
386
387 spin_lock(&inode->i_lock);
388 if (ci->i_xattrs.version != xattr_version) {
389 /* lost a race, retry */
390 for (i = 0; i < numattr; i++)
391 kfree(xattrs[i]);
392 kfree(xattrs);
393 goto start;
394 }
395 err = -EIO;
396 while (numattr--) {
397 ceph_decode_32_safe(&p, end, len, bad);
398 namelen = len;
399 name = p;
400 p += len;
401 ceph_decode_32_safe(&p, end, len, bad);
402 val = p;
403 p += len;
404
405 err = __set_xattr(ci, name, namelen, val, len,
406 0, 0, 0, &xattrs[numattr]);
407
408 if (err < 0)
409 goto bad;
410 }
411 kfree(xattrs);
412 }
413 ci->i_xattrs.index_version = ci->i_xattrs.version;
414 ci->i_xattrs.dirty = false;
415
416 return err;
417bad_lock:
418 spin_lock(&inode->i_lock);
419bad:
420 if (xattrs) {
421 for (i = 0; i < numattr; i++)
422 kfree(xattrs[i]);
423 kfree(xattrs);
424 }
425 ci->i_xattrs.names_size = 0;
426 return err;
427}
428
429static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
430 int val_size)
431{
432 /*
433 * 4 bytes for the length, and additional 4 bytes per each xattr name,
434 * 4 bytes per each value
435 */
436 int size = 4 + ci->i_xattrs.count*(4 + 4) +
437 ci->i_xattrs.names_size +
438 ci->i_xattrs.vals_size;
439 dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
440 ci->i_xattrs.count, ci->i_xattrs.names_size,
441 ci->i_xattrs.vals_size);
442
443 if (name_size)
444 size += 4 + 4 + name_size + val_size;
445
446 return size;
447}
448
449/*
450 * If there are dirty xattrs, reencode xattrs into the prealloc_blob
451 * and swap into place.
452 */
453void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
454{
455 struct rb_node *p;
456 struct ceph_inode_xattr *xattr = NULL;
457 void *dest;
458
459 dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
460 if (ci->i_xattrs.dirty) {
461 int need = __get_required_blob_size(ci, 0, 0);
462
463 BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);
464
465 p = rb_first(&ci->i_xattrs.index);
466 dest = ci->i_xattrs.prealloc_blob->vec.iov_base;
467
468 ceph_encode_32(&dest, ci->i_xattrs.count);
469 while (p) {
470 xattr = rb_entry(p, struct ceph_inode_xattr, node);
471
472 ceph_encode_32(&dest, xattr->name_len);
473 memcpy(dest, xattr->name, xattr->name_len);
474 dest += xattr->name_len;
475 ceph_encode_32(&dest, xattr->val_len);
476 memcpy(dest, xattr->val, xattr->val_len);
477 dest += xattr->val_len;
478
479 p = rb_next(p);
480 }
481
482 /* adjust buffer len; it may be larger than we need */
483 ci->i_xattrs.prealloc_blob->vec.iov_len =
484 dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
485
486 if (ci->i_xattrs.blob)
487 ceph_buffer_put(ci->i_xattrs.blob);
488 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
489 ci->i_xattrs.prealloc_blob = NULL;
490 ci->i_xattrs.dirty = false;
491 }
492}
493
494ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
495 size_t size)
496{
497 struct inode *inode = dentry->d_inode;
498 struct ceph_inode_info *ci = ceph_inode(inode);
499 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
500 int err;
501 struct ceph_inode_xattr *xattr;
502 struct ceph_vxattr_cb *vxattr = NULL;
503
504 if (!ceph_is_valid_xattr(name))
505 return -ENODATA;
506
507 /* let's see if a virtual xattr was requested */
508 if (vxattrs)
509 vxattr = ceph_match_vxattr(vxattrs, name);
510
511 spin_lock(&inode->i_lock);
512 dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
513 ci->i_xattrs.version, ci->i_xattrs.index_version);
514
515 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
516 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
517 goto get_xattr;
518 } else {
519 spin_unlock(&inode->i_lock);
520 /* get xattrs from mds (if we don't already have them) */
521 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
522 if (err)
523 return err;
524 }
525
526 spin_lock(&inode->i_lock);
527
528 if (vxattr && vxattr->readonly) {
529 err = vxattr->getxattr_cb(ci, value, size);
530 goto out;
531 }
532
533 err = __build_xattrs(inode);
534 if (err < 0)
535 goto out;
536
537get_xattr:
538 err = -ENODATA; /* == ENOATTR */
539 xattr = __get_xattr(ci, name);
540 if (!xattr) {
541 if (vxattr)
542 err = vxattr->getxattr_cb(ci, value, size);
543 goto out;
544 }
545
546 err = -ERANGE;
547 if (size && size < xattr->val_len)
548 goto out;
549
550 err = xattr->val_len;
551 if (size == 0)
552 goto out;
553
554 memcpy(value, xattr->val, xattr->val_len);
555
556out:
557 spin_unlock(&inode->i_lock);
558 return err;
559}
560
561ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
562{
563 struct inode *inode = dentry->d_inode;
564 struct ceph_inode_info *ci = ceph_inode(inode);
565 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
566 u32 vir_namelen = 0;
567 u32 namelen;
568 int err;
569 u32 len;
570 int i;
571
572 spin_lock(&inode->i_lock);
573 dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
574 ci->i_xattrs.version, ci->i_xattrs.index_version);
575
576 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
577 (ci->i_xattrs.index_version > ci->i_xattrs.version)) {
578 goto list_xattr;
579 } else {
580 spin_unlock(&inode->i_lock);
581 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
582 if (err)
583 return err;
584 }
585
586 spin_lock(&inode->i_lock);
587
588 err = __build_xattrs(inode);
589 if (err < 0)
590 goto out;
591
592list_xattr:
593 vir_namelen = 0;
594 /* include virtual dir xattrs */
595 if (vxattrs)
596 for (i = 0; vxattrs[i].name; i++)
597 vir_namelen += strlen(vxattrs[i].name) + 1;
598 /* adding 1 byte per each variable due to the null termination */
599 namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
600 err = -ERANGE;
601 if (size && namelen > size)
602 goto out;
603
604 err = namelen;
605 if (size == 0)
606 goto out;
607
608 names = __copy_xattr_names(ci, names);
609
610 /* virtual xattr names, too */
611 if (vxattrs)
612 for (i = 0; vxattrs[i].name; i++) {
613 len = sprintf(names, "%s", vxattrs[i].name);
614 names += len + 1;
615 }
616
617out:
618 spin_unlock(&inode->i_lock);
619 return err;
620}
621
622static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
623 const char *value, size_t size, int flags)
624{
625 struct ceph_client *client = ceph_client(dentry->d_sb);
626 struct inode *inode = dentry->d_inode;
627 struct ceph_inode_info *ci = ceph_inode(inode);
628 struct inode *parent_inode = dentry->d_parent->d_inode;
629 struct ceph_mds_request *req;
630 struct ceph_mds_client *mdsc = &client->mdsc;
631 int err;
632 int i, nr_pages;
633 struct page **pages = NULL;
634 void *kaddr;
635
636 /* copy value into some pages */
637 nr_pages = calc_pages_for(0, size);
638 if (nr_pages) {
639 pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS);
640 if (!pages)
641 return -ENOMEM;
642 err = -ENOMEM;
643 for (i = 0; i < nr_pages; i++) {
644 pages[i] = alloc_page(GFP_NOFS);
645 if (!pages[i]) {
646 nr_pages = i;
647 goto out;
648 }
649 kaddr = kmap(pages[i]);
650 memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
651 min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
652 }
653 }
654
655 dout("setxattr value=%.*s\n", (int)size, value);
656
657 /* do request */
658 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
659 USE_AUTH_MDS);
660 if (IS_ERR(req)) {
661 err = PTR_ERR(req);
662 goto out;
663 }
664 req->r_inode = igrab(inode);
665 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
666 req->r_num_caps = 1;
667 req->r_args.setxattr.flags = cpu_to_le32(flags);
668 req->r_path2 = kstrdup(name, GFP_NOFS);
669
670 req->r_pages = pages;
671 req->r_num_pages = nr_pages;
672 req->r_data_len = size;
673
674 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
675 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
676 ceph_mdsc_put_request(req);
677 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
678
679out:
680 if (pages) {
681 for (i = 0; i < nr_pages; i++)
682 __free_page(pages[i]);
683 kfree(pages);
684 }
685 return err;
686}
687
688int ceph_setxattr(struct dentry *dentry, const char *name,
689 const void *value, size_t size, int flags)
690{
691 struct inode *inode = dentry->d_inode;
692 struct ceph_inode_info *ci = ceph_inode(inode);
693 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
694 int err;
695 int name_len = strlen(name);
696 int val_len = size;
697 char *newname = NULL;
698 char *newval = NULL;
699 struct ceph_inode_xattr *xattr = NULL;
700 int issued;
701 int required_blob_size;
702
703 if (ceph_snap(inode) != CEPH_NOSNAP)
704 return -EROFS;
705
706 if (!ceph_is_valid_xattr(name))
707 return -EOPNOTSUPP;
708
709 if (vxattrs) {
710 struct ceph_vxattr_cb *vxattr =
711 ceph_match_vxattr(vxattrs, name);
712 if (vxattr && vxattr->readonly)
713 return -EOPNOTSUPP;
714 }
715
716 /* preallocate memory for xattr name, value, index node */
717 err = -ENOMEM;
718 newname = kmalloc(name_len + 1, GFP_NOFS);
719 if (!newname)
720 goto out;
721 memcpy(newname, name, name_len + 1);
722
723 if (val_len) {
724 newval = kmalloc(val_len + 1, GFP_NOFS);
725 if (!newval)
726 goto out;
727 memcpy(newval, value, val_len);
728 newval[val_len] = '\0';
729 }
730
731 xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
732 if (!xattr)
733 goto out;
734
735 spin_lock(&inode->i_lock);
736retry:
737 issued = __ceph_caps_issued(ci, NULL);
738 if (!(issued & CEPH_CAP_XATTR_EXCL))
739 goto do_sync;
740 __build_xattrs(inode);
741
742 required_blob_size = __get_required_blob_size(ci, name_len, val_len);
743
744 if (!ci->i_xattrs.prealloc_blob ||
745 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
746 struct ceph_buffer *blob = NULL;
747
748 spin_unlock(&inode->i_lock);
749 dout(" preaallocating new blob size=%d\n", required_blob_size);
750 blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
751 if (!blob)
752 goto out;
753 spin_lock(&inode->i_lock);
754 if (ci->i_xattrs.prealloc_blob)
755 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
756 ci->i_xattrs.prealloc_blob = blob;
757 goto retry;
758 }
759
760 dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
761 err = __set_xattr(ci, newname, name_len, newval,
762 val_len, 1, 1, 1, &xattr);
763 __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
764 ci->i_xattrs.dirty = true;
765 inode->i_ctime = CURRENT_TIME;
766 spin_unlock(&inode->i_lock);
767
768 return err;
769
770do_sync:
771 spin_unlock(&inode->i_lock);
772 err = ceph_sync_setxattr(dentry, name, value, size, flags);
773out:
774 kfree(newname);
775 kfree(newval);
776 kfree(xattr);
777 return err;
778}
779
780static int ceph_send_removexattr(struct dentry *dentry, const char *name)
781{
782 struct ceph_client *client = ceph_client(dentry->d_sb);
783 struct ceph_mds_client *mdsc = &client->mdsc;
784 struct inode *inode = dentry->d_inode;
785 struct inode *parent_inode = dentry->d_parent->d_inode;
786 struct ceph_mds_request *req;
787 int err;
788
789 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR,
790 USE_AUTH_MDS);
791 if (IS_ERR(req))
792 return PTR_ERR(req);
793 req->r_inode = igrab(inode);
794 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
795 req->r_num_caps = 1;
796 req->r_path2 = kstrdup(name, GFP_NOFS);
797
798 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
799 ceph_mdsc_put_request(req);
800 return err;
801}
802
803int ceph_removexattr(struct dentry *dentry, const char *name)
804{
805 struct inode *inode = dentry->d_inode;
806 struct ceph_inode_info *ci = ceph_inode(inode);
807 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
808 int issued;
809 int err;
810
811 if (ceph_snap(inode) != CEPH_NOSNAP)
812 return -EROFS;
813
814 if (!ceph_is_valid_xattr(name))
815 return -EOPNOTSUPP;
816
817 if (vxattrs) {
818 struct ceph_vxattr_cb *vxattr =
819 ceph_match_vxattr(vxattrs, name);
820 if (vxattr && vxattr->readonly)
821 return -EOPNOTSUPP;
822 }
823
824 spin_lock(&inode->i_lock);
825 __build_xattrs(inode);
826 issued = __ceph_caps_issued(ci, NULL);
827 dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
828
829 if (!(issued & CEPH_CAP_XATTR_EXCL))
830 goto do_sync;
831
832 err = __remove_xattr_by_name(ceph_inode(inode), name);
833 __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
834 ci->i_xattrs.dirty = true;
835 inode->i_ctime = CURRENT_TIME;
836
837 spin_unlock(&inode->i_lock);
838
839 return err;
840do_sync:
841 spin_unlock(&inode->i_lock);
842 err = ceph_send_removexattr(dentry, name);
843 return err;
844}
845
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 094ea65afc85..bc0025cdd1c9 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,8 @@
1Version 1.62
2------------
3Add sockopt=TCP_NODELAY mount option. EA (xattr) routines hardened
4to more strictly handle corrupt frames.
5
1Version 1.61 6Version 1.61
2------------ 7------------
3Fix append problem to Samba servers (files opened with O_APPEND could 8Fix append problem to Samba servers (files opened with O_APPEND could
@@ -5,7 +10,9 @@ have duplicated data). Fix oops in cifs_lookup. Workaround problem
5mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session. 10mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session.
6Disable use of server inode numbers when server only 11Disable use of server inode numbers when server only
7partially supports them (e.g. for one server querying inode numbers on 12partially supports them (e.g. for one server querying inode numbers on
8FindFirst fails but QPathInfo queries works). 13FindFirst fails but QPathInfo queries works). Fix oops with dfs in
14cifs_put_smb_ses. Fix mmap to work on directio mounts (needed
15for OpenOffice when on forcedirectio mount e.g.)
9 16
10Version 1.60 17Version 1.60
11------------- 18-------------
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 20692fbfdb24..a20bea598933 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -136,7 +136,7 @@ asn1_enum_decode(struct asn1_ctx *ctx, __le32 *val)
136 return 0; 136 return 0;
137 } 137 }
138 138
139 ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to lenght octet */ 139 ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to length octet */
140 if ((ch) == ASN1_ENUM) /* if ch value is ENUM, 0xa */ 140 if ((ch) == ASN1_ENUM) /* if ch value is ENUM, 0xa */
141 *val = *(++(ctx->pointer)); /* value has enum value */ 141 *val = *(++(ctx->pointer)); /* value has enum value */
142 else 142 else
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index fea9e898c4ba..78e4d2a3a68b 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -15,6 +15,7 @@
15#include <linux/dcache.h> 15#include <linux/dcache.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/slab.h>
18#include <linux/vfs.h> 19#include <linux/vfs.h>
19#include <linux/fs.h> 20#include <linux/fs.h>
20#include "cifsglob.h" 21#include "cifsglob.h"
@@ -54,7 +55,7 @@ void cifs_dfs_release_automount_timer(void)
54 * Extracts sharename from full UNC. 55 * Extracts sharename from full UNC.
55 * i.e. strips from UNC trailing path that is not part of share 56 * i.e. strips from UNC trailing path that is not part of share
56 * name and fixup missing '\' in the beginning of DFS node referral 57 * name and fixup missing '\' in the beginning of DFS node referral
57 * if neccessary. 58 * if necessary.
58 * Returns pointer to share name on success or ERR_PTR on error. 59 * Returns pointer to share name on success or ERR_PTR on error.
59 * Caller is responsible for freeing returned string. 60 * Caller is responsible for freeing returned string.
60 */ 61 */
@@ -269,7 +270,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
269 int err; 270 int err;
270 271
271 mntget(newmnt); 272 mntget(newmnt);
272 err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags, mntlist); 273 err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags | MNT_SHRINKABLE, mntlist);
273 switch (err) { 274 switch (err) {
274 case 0: 275 case 0:
275 path_put(&nd->path); 276 path_put(&nd->path);
@@ -371,7 +372,6 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
371 if (IS_ERR(mnt)) 372 if (IS_ERR(mnt))
372 goto out_err; 373 goto out_err;
373 374
374 nd->path.mnt->mnt_flags |= MNT_SHRINKABLE;
375 rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list); 375 rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list);
376 376
377out: 377out:
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 4797787c6a44..246a167cb913 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -18,6 +18,8 @@
18#ifndef _CIFS_FS_SB_H 18#ifndef _CIFS_FS_SB_H
19#define _CIFS_FS_SB_H 19#define _CIFS_FS_SB_H
20 20
21#include <linux/backing-dev.h>
22
21#define CIFS_MOUNT_NO_PERM 1 /* do not do client vfs_perm check */ 23#define CIFS_MOUNT_NO_PERM 1 /* do not do client vfs_perm check */
22#define CIFS_MOUNT_SET_UID 2 /* set current's euid in create etc. */ 24#define CIFS_MOUNT_SET_UID 2 /* set current's euid in create etc. */
23#define CIFS_MOUNT_SERVER_INUM 4 /* inode numbers from uniqueid from server */ 25#define CIFS_MOUNT_SERVER_INUM 4 /* inode numbers from uniqueid from server */
@@ -50,5 +52,6 @@ struct cifs_sb_info {
50#ifdef CONFIG_CIFS_DFS_UPCALL 52#ifdef CONFIG_CIFS_DFS_UPCALL
51 char *mountdata; /* mount options received at mount time */ 53 char *mountdata; /* mount options received at mount time */
52#endif 54#endif
55 struct backing_dev_info bdi;
53}; 56};
54#endif /* _CIFS_FS_SB_H */ 57#endif /* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 8ec7736ce954..310d12f69a92 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/slab.h>
23#include <linux/string.h> 24#include <linux/string.h>
24#include <keys/user-type.h> 25#include <keys/user-type.h>
25#include <linux/key-type.h> 26#include <linux/key-type.h>
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 714a542cbafc..d07676bd76d2 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -19,6 +19,7 @@
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/slab.h>
22#include "cifs_unicode.h" 23#include "cifs_unicode.h"
23#include "cifs_uniupr.h" 24#include "cifs_uniupr.h"
24#include "cifspdu.h" 25#include "cifspdu.h"
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 7dfe0842a6f6..9b716d044bbd 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include <linux/fs.h> 24#include <linux/fs.h>
25#include <linux/slab.h>
25#include "cifspdu.h" 26#include "cifspdu.h"
26#include "cifsglob.h" 27#include "cifsglob.h"
27#include "cifsacl.h" 28#include "cifsacl.h"
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 7efe1745494d..fbe986430d0c 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/slab.h>
23#include "cifspdu.h" 24#include "cifspdu.h"
24#include "cifsglob.h" 25#include "cifsglob.h"
25#include "cifs_debug.h" 26#include "cifs_debug.h"
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 29f1da761bbf..ad235d604a0b 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -103,6 +103,12 @@ cifs_read_super(struct super_block *sb, void *data,
103 if (cifs_sb == NULL) 103 if (cifs_sb == NULL)
104 return -ENOMEM; 104 return -ENOMEM;
105 105
106 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
107 if (rc) {
108 kfree(cifs_sb);
109 return rc;
110 }
111
106#ifdef CONFIG_CIFS_DFS_UPCALL 112#ifdef CONFIG_CIFS_DFS_UPCALL
107 /* copy mount params to sb for use in submounts */ 113 /* copy mount params to sb for use in submounts */
108 /* BB: should we move this after the mount so we 114 /* BB: should we move this after the mount so we
@@ -115,6 +121,7 @@ cifs_read_super(struct super_block *sb, void *data,
115 int len = strlen(data); 121 int len = strlen(data);
116 cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL); 122 cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL);
117 if (cifs_sb->mountdata == NULL) { 123 if (cifs_sb->mountdata == NULL) {
124 bdi_destroy(&cifs_sb->bdi);
118 kfree(sb->s_fs_info); 125 kfree(sb->s_fs_info);
119 sb->s_fs_info = NULL; 126 sb->s_fs_info = NULL;
120 return -ENOMEM; 127 return -ENOMEM;
@@ -135,6 +142,7 @@ cifs_read_super(struct super_block *sb, void *data,
135 142
136 sb->s_magic = CIFS_MAGIC_NUMBER; 143 sb->s_magic = CIFS_MAGIC_NUMBER;
137 sb->s_op = &cifs_super_ops; 144 sb->s_op = &cifs_super_ops;
145 sb->s_bdi = &cifs_sb->bdi;
138/* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512) 146/* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
139 sb->s_blocksize = 147 sb->s_blocksize =
140 cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */ 148 cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */
@@ -183,6 +191,7 @@ out_mount_failed:
183 } 191 }
184#endif 192#endif
185 unload_nls(cifs_sb->local_nls); 193 unload_nls(cifs_sb->local_nls);
194 bdi_destroy(&cifs_sb->bdi);
186 kfree(cifs_sb); 195 kfree(cifs_sb);
187 } 196 }
188 return rc; 197 return rc;
@@ -214,6 +223,7 @@ cifs_put_super(struct super_block *sb)
214#endif 223#endif
215 224
216 unload_nls(cifs_sb->local_nls); 225 unload_nls(cifs_sb->local_nls);
226 bdi_destroy(&cifs_sb->bdi);
217 kfree(cifs_sb); 227 kfree(cifs_sb);
218 228
219 unlock_kernel(); 229 unlock_kernel();
@@ -312,6 +322,7 @@ cifs_alloc_inode(struct super_block *sb)
312 cifs_inode->clientCanCacheRead = false; 322 cifs_inode->clientCanCacheRead = false;
313 cifs_inode->clientCanCacheAll = false; 323 cifs_inode->clientCanCacheAll = false;
314 cifs_inode->delete_pending = false; 324 cifs_inode->delete_pending = false;
325 cifs_inode->invalid_mapping = false;
315 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 326 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
316 cifs_inode->server_eof = 0; 327 cifs_inode->server_eof = 0;
317 328
@@ -638,7 +649,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
638 setting the revalidate time to zero */ 649 setting the revalidate time to zero */
639 CIFS_I(file->f_path.dentry->d_inode)->time = 0; 650 CIFS_I(file->f_path.dentry->d_inode)->time = 0;
640 651
641 retval = cifs_revalidate(file->f_path.dentry); 652 retval = cifs_revalidate_file(file);
642 if (retval < 0) 653 if (retval < 0)
643 return (loff_t)retval; 654 return (loff_t)retval;
644 } 655 }
@@ -758,7 +769,7 @@ const struct file_operations cifs_file_ops = {
758}; 769};
759 770
760const struct file_operations cifs_file_direct_ops = { 771const struct file_operations cifs_file_direct_ops = {
761 /* no mmap, no aio, no readv - 772 /* no aio, no readv -
762 BB reevaluate whether they can be done with directio, no cache */ 773 BB reevaluate whether they can be done with directio, no cache */
763 .read = cifs_user_read, 774 .read = cifs_user_read,
764 .write = cifs_user_write, 775 .write = cifs_user_write,
@@ -767,6 +778,7 @@ const struct file_operations cifs_file_direct_ops = {
767 .lock = cifs_lock, 778 .lock = cifs_lock,
768 .fsync = cifs_fsync, 779 .fsync = cifs_fsync,
769 .flush = cifs_flush, 780 .flush = cifs_flush,
781 .mmap = cifs_file_mmap,
770 .splice_read = generic_file_splice_read, 782 .splice_read = generic_file_splice_read,
771#ifdef CONFIG_CIFS_POSIX 783#ifdef CONFIG_CIFS_POSIX
772 .unlocked_ioctl = cifs_ioctl, 784 .unlocked_ioctl = cifs_ioctl,
@@ -806,6 +818,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
806 .release = cifs_close, 818 .release = cifs_close,
807 .fsync = cifs_fsync, 819 .fsync = cifs_fsync,
808 .flush = cifs_flush, 820 .flush = cifs_flush,
821 .mmap = cifs_file_mmap,
809 .splice_read = generic_file_splice_read, 822 .splice_read = generic_file_splice_read,
810#ifdef CONFIG_CIFS_POSIX 823#ifdef CONFIG_CIFS_POSIX
811 .unlocked_ioctl = cifs_ioctl, 824 .unlocked_ioctl = cifs_ioctl,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index ac2b24c192f8..7aa57ecdc437 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -61,7 +61,8 @@ extern int cifs_mkdir(struct inode *, struct dentry *, int);
61extern int cifs_rmdir(struct inode *, struct dentry *); 61extern int cifs_rmdir(struct inode *, struct dentry *);
62extern int cifs_rename(struct inode *, struct dentry *, struct inode *, 62extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
63 struct dentry *); 63 struct dentry *);
64extern int cifs_revalidate(struct dentry *); 64extern int cifs_revalidate_file(struct file *filp);
65extern int cifs_revalidate_dentry(struct dentry *);
65extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 66extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
66extern int cifs_setattr(struct dentry *, struct iattr *); 67extern int cifs_setattr(struct dentry *, struct iattr *);
67 68
@@ -113,5 +114,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
113extern const struct export_operations cifs_export_ops; 114extern const struct export_operations cifs_export_ops;
114#endif /* EXPERIMENTAL */ 115#endif /* EXPERIMENTAL */
115 116
116#define CIFS_VERSION "1.61" 117#define CIFS_VERSION "1.62"
117#endif /* _CIFSFS_H */ 118#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 4b35f7ec0583..ecf0ffbe2b64 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -18,6 +18,7 @@
18 */ 18 */
19#include <linux/in.h> 19#include <linux/in.h>
20#include <linux/in6.h> 20#include <linux/in6.h>
21#include <linux/slab.h>
21#include <linux/slow-work.h> 22#include <linux/slow-work.h>
22#include "cifs_fs_sb.h" 23#include "cifs_fs_sb.h"
23#include "cifsacl.h" 24#include "cifsacl.h"
@@ -149,6 +150,7 @@ struct TCP_Server_Info {
149 bool svlocal:1; /* local server or remote */ 150 bool svlocal:1; /* local server or remote */
150 bool noblocksnd; /* use blocking sendmsg */ 151 bool noblocksnd; /* use blocking sendmsg */
151 bool noautotune; /* do not autotune send buf sizes */ 152 bool noautotune; /* do not autotune send buf sizes */
153 bool tcp_nodelay;
152 atomic_t inFlight; /* number of requests on the wire to server */ 154 atomic_t inFlight; /* number of requests on the wire to server */
153#ifdef CONFIG_CIFS_STATS2 155#ifdef CONFIG_CIFS_STATS2
154 atomic_t inSend; /* requests trying to send */ 156 atomic_t inSend; /* requests trying to send */
@@ -204,7 +206,7 @@ struct cifsUidInfo {
204struct cifsSesInfo { 206struct cifsSesInfo {
205 struct list_head smb_ses_list; 207 struct list_head smb_ses_list;
206 struct list_head tcon_list; 208 struct list_head tcon_list;
207 struct semaphore sesSem; 209 struct mutex session_mutex;
208#if 0 210#if 0
209 struct cifsUidInfo *uidInfo; /* pointer to user info */ 211 struct cifsUidInfo *uidInfo; /* pointer to user info */
210#endif 212#endif
@@ -388,6 +390,7 @@ struct cifsInodeInfo {
388 bool clientCanCacheRead:1; /* read oplock */ 390 bool clientCanCacheRead:1; /* read oplock */
389 bool clientCanCacheAll:1; /* read and writebehind oplock */ 391 bool clientCanCacheAll:1; /* read and writebehind oplock */
390 bool delete_pending:1; /* DELETE_ON_CLOSE is set */ 392 bool delete_pending:1; /* DELETE_ON_CLOSE is set */
393 bool invalid_mapping:1; /* pagecache is invalid */
391 u64 server_eof; /* current file size on server */ 394 u64 server_eof; /* current file size on server */
392 u64 uniqueid; /* server inode number */ 395 u64 uniqueid; /* server inode number */
393 struct inode vfs_inode; 396 struct inode vfs_inode;
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 3877737f96a6..14d036d8db11 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -415,10 +415,10 @@ struct smb_hdr {
415 __u8 WordCount; 415 __u8 WordCount;
416} __attribute__((packed)); 416} __attribute__((packed));
417/* given a pointer to an smb_hdr retrieve the value of byte count */ 417/* given a pointer to an smb_hdr retrieve the value of byte count */
418#define BCC(smb_var) (*(__u16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount))) 418#define BCC(smb_var) (*(__u16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount)))
419#define BCC_LE(smb_var) (*(__le16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount))) 419#define BCC_LE(smb_var) (*(__le16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount)))
420/* given a pointer to an smb_hdr retrieve the pointer to the byte area */ 420/* given a pointer to an smb_hdr retrieve the pointer to the byte area */
421#define pByteArea(smb_var) ((unsigned char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount) + 2) 421#define pByteArea(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount) + 2)
422 422
423/* 423/*
424 * Computer Name Length (since Netbios name was length 16 with last byte 0x20) 424 * Computer Name Length (since Netbios name was length 16 with last byte 0x20)
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 5646727e33f5..39e47f46dea5 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -104,10 +104,12 @@ extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
104extern struct inode *cifs_iget(struct super_block *sb, 104extern struct inode *cifs_iget(struct super_block *sb,
105 struct cifs_fattr *fattr); 105 struct cifs_fattr *fattr);
106 106
107extern int cifs_get_file_info(struct file *filp);
107extern int cifs_get_inode_info(struct inode **pinode, 108extern int cifs_get_inode_info(struct inode **pinode,
108 const unsigned char *search_path, 109 const unsigned char *search_path,
109 FILE_ALL_INFO *pfile_info, 110 FILE_ALL_INFO *pfile_info,
110 struct super_block *sb, int xid, const __u16 *pfid); 111 struct super_block *sb, int xid, const __u16 *pfid);
112extern int cifs_get_file_info_unix(struct file *filp);
111extern int cifs_get_inode_info_unix(struct inode **pinode, 113extern int cifs_get_inode_info_unix(struct inode **pinode,
112 const unsigned char *search_path, 114 const unsigned char *search_path,
113 struct super_block *sb, int xid); 115 struct super_block *sb, int xid);
@@ -142,6 +144,8 @@ extern int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
142extern int CIFSFindClose(const int, struct cifsTconInfo *tcon, 144extern int CIFSFindClose(const int, struct cifsTconInfo *tcon,
143 const __u16 search_handle); 145 const __u16 search_handle);
144 146
147extern int CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
148 u16 netfid, FILE_ALL_INFO *pFindData);
145extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon, 149extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
146 const unsigned char *searchName, 150 const unsigned char *searchName,
147 FILE_ALL_INFO *findData, 151 FILE_ALL_INFO *findData,
@@ -152,6 +156,8 @@ extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
152 FILE_ALL_INFO *findData, 156 FILE_ALL_INFO *findData,
153 const struct nls_table *nls_codepage, int remap); 157 const struct nls_table *nls_codepage, int remap);
154 158
159extern int CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
160 u16 netfid, FILE_UNIX_BASIC_INFO *pFindData);
155extern int CIFSSMBUnixQPathInfo(const int xid, 161extern int CIFSSMBUnixQPathInfo(const int xid,
156 struct cifsTconInfo *tcon, 162 struct cifsTconInfo *tcon,
157 const unsigned char *searchName, 163 const unsigned char *searchName,
@@ -363,13 +369,10 @@ extern int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
363 __u32 filter, struct file *file, int multishot, 369 __u32 filter, struct file *file, int multishot,
364 const struct nls_table *nls_codepage); 370 const struct nls_table *nls_codepage);
365extern ssize_t CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon, 371extern ssize_t CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
366 const unsigned char *searchName, char *EAData, 372 const unsigned char *searchName,
373 const unsigned char *ea_name, char *EAData,
367 size_t bufsize, const struct nls_table *nls_codepage, 374 size_t bufsize, const struct nls_table *nls_codepage,
368 int remap_special_chars); 375 int remap_special_chars);
369extern ssize_t CIFSSMBQueryEA(const int xid, struct cifsTconInfo *tcon,
370 const unsigned char *searchName, const unsigned char *ea_name,
371 unsigned char *ea_value, size_t buf_size,
372 const struct nls_table *nls_codepage, int remap_special_chars);
373extern int CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon, 376extern int CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon,
374 const char *fileName, const char *ea_name, 377 const char *fileName, const char *ea_name,
375 const void *ea_value, const __u16 ea_value_len, 378 const void *ea_value, const __u16 ea_value_len,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 941441d3e386..5d3f29fef532 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -30,6 +30,7 @@
30#include <linux/fs.h> 30#include <linux/fs.h>
31#include <linux/kernel.h> 31#include <linux/kernel.h>
32#include <linux/vfs.h> 32#include <linux/vfs.h>
33#include <linux/slab.h>
33#include <linux/posix_acl_xattr.h> 34#include <linux/posix_acl_xattr.h>
34#include <asm/uaccess.h> 35#include <asm/uaccess.h>
35#include "cifspdu.h" 36#include "cifspdu.h"
@@ -170,19 +171,19 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
170 * need to prevent multiple threads trying to simultaneously 171 * need to prevent multiple threads trying to simultaneously
171 * reconnect the same SMB session 172 * reconnect the same SMB session
172 */ 173 */
173 down(&ses->sesSem); 174 mutex_lock(&ses->session_mutex);
174 if (ses->need_reconnect) 175 if (ses->need_reconnect)
175 rc = cifs_setup_session(0, ses, nls_codepage); 176 rc = cifs_setup_session(0, ses, nls_codepage);
176 177
177 /* do we need to reconnect tcon? */ 178 /* do we need to reconnect tcon? */
178 if (rc || !tcon->need_reconnect) { 179 if (rc || !tcon->need_reconnect) {
179 up(&ses->sesSem); 180 mutex_unlock(&ses->session_mutex);
180 goto out; 181 goto out;
181 } 182 }
182 183
183 mark_open_files_invalid(tcon); 184 mark_open_files_invalid(tcon);
184 rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage); 185 rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
185 up(&ses->sesSem); 186 mutex_unlock(&ses->session_mutex);
186 cFYI(1, ("reconnect tcon rc = %d", rc)); 187 cFYI(1, ("reconnect tcon rc = %d", rc));
187 188
188 if (rc) 189 if (rc)
@@ -500,7 +501,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
500 } else if (pSMBr->hdr.WordCount == 13) { 501 } else if (pSMBr->hdr.WordCount == 13) {
501 cERROR(1, ("mount failed, cifs module not built " 502 cERROR(1, ("mount failed, cifs module not built "
502 "with CIFS_WEAK_PW_HASH support")); 503 "with CIFS_WEAK_PW_HASH support"));
503 rc = -EOPNOTSUPP; 504 rc = -EOPNOTSUPP;
504#endif /* WEAK_PW_HASH */ 505#endif /* WEAK_PW_HASH */
505 goto neg_err_exit; 506 goto neg_err_exit;
506 } else if (pSMBr->hdr.WordCount != 17) { 507 } else if (pSMBr->hdr.WordCount != 17) {
@@ -700,13 +701,13 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
700 if (!ses || !ses->server) 701 if (!ses || !ses->server)
701 return -EIO; 702 return -EIO;
702 703
703 down(&ses->sesSem); 704 mutex_lock(&ses->session_mutex);
704 if (ses->need_reconnect) 705 if (ses->need_reconnect)
705 goto session_already_dead; /* no need to send SMBlogoff if uid 706 goto session_already_dead; /* no need to send SMBlogoff if uid
706 already closed due to reconnect */ 707 already closed due to reconnect */
707 rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB); 708 rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB);
708 if (rc) { 709 if (rc) {
709 up(&ses->sesSem); 710 mutex_unlock(&ses->session_mutex);
710 return rc; 711 return rc;
711 } 712 }
712 713
@@ -721,7 +722,7 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
721 pSMB->AndXCommand = 0xFF; 722 pSMB->AndXCommand = 0xFF;
722 rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0); 723 rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0);
723session_already_dead: 724session_already_dead:
724 up(&ses->sesSem); 725 mutex_unlock(&ses->session_mutex);
725 726
726 /* if session dead then we do not need to do ulogoff, 727 /* if session dead then we do not need to do ulogoff,
727 since server closed smb session, no sense reporting 728 since server closed smb session, no sense reporting
@@ -1430,6 +1431,8 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
1430 __u32 bytes_sent; 1431 __u32 bytes_sent;
1431 __u16 byte_count; 1432 __u16 byte_count;
1432 1433
1434 *nbytes = 0;
1435
1433 /* cFYI(1, ("write at %lld %d bytes", offset, count));*/ 1436 /* cFYI(1, ("write at %lld %d bytes", offset, count));*/
1434 if (tcon->ses == NULL) 1437 if (tcon->ses == NULL)
1435 return -ECONNABORTED; 1438 return -ECONNABORTED;
@@ -1512,11 +1515,18 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
1512 cifs_stats_inc(&tcon->num_writes); 1515 cifs_stats_inc(&tcon->num_writes);
1513 if (rc) { 1516 if (rc) {
1514 cFYI(1, ("Send error in write = %d", rc)); 1517 cFYI(1, ("Send error in write = %d", rc));
1515 *nbytes = 0;
1516 } else { 1518 } else {
1517 *nbytes = le16_to_cpu(pSMBr->CountHigh); 1519 *nbytes = le16_to_cpu(pSMBr->CountHigh);
1518 *nbytes = (*nbytes) << 16; 1520 *nbytes = (*nbytes) << 16;
1519 *nbytes += le16_to_cpu(pSMBr->Count); 1521 *nbytes += le16_to_cpu(pSMBr->Count);
1522
1523 /*
1524 * Mask off high 16 bits when bytes written as returned by the
1525 * server is greater than bytes requested by the client. Some
1526 * OS/2 servers are known to set incorrect CountHigh values.
1527 */
1528 if (*nbytes > count)
1529 *nbytes &= 0xFFFF;
1520 } 1530 }
1521 1531
1522 cifs_buf_release(pSMB); 1532 cifs_buf_release(pSMB);
@@ -1605,6 +1615,14 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1605 *nbytes = le16_to_cpu(pSMBr->CountHigh); 1615 *nbytes = le16_to_cpu(pSMBr->CountHigh);
1606 *nbytes = (*nbytes) << 16; 1616 *nbytes = (*nbytes) << 16;
1607 *nbytes += le16_to_cpu(pSMBr->Count); 1617 *nbytes += le16_to_cpu(pSMBr->Count);
1618
1619 /*
1620 * Mask off high 16 bits when bytes written as returned by the
1621 * server is greater than bytes requested by the client. OS/2
1622 * servers are known to set incorrect CountHigh values.
1623 */
1624 if (*nbytes > count)
1625 *nbytes &= 0xFFFF;
1608 } 1626 }
1609 1627
1610/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ 1628/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
@@ -1793,8 +1811,21 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
1793 } 1811 }
1794 parm_data = (struct cifs_posix_lock *) 1812 parm_data = (struct cifs_posix_lock *)
1795 ((char *)&pSMBr->hdr.Protocol + data_offset); 1813 ((char *)&pSMBr->hdr.Protocol + data_offset);
1796 if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK)) 1814 if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK))
1797 pLockData->fl_type = F_UNLCK; 1815 pLockData->fl_type = F_UNLCK;
1816 else {
1817 if (parm_data->lock_type ==
1818 __constant_cpu_to_le16(CIFS_RDLCK))
1819 pLockData->fl_type = F_RDLCK;
1820 else if (parm_data->lock_type ==
1821 __constant_cpu_to_le16(CIFS_WRLCK))
1822 pLockData->fl_type = F_WRLCK;
1823
1824 pLockData->fl_start = parm_data->start;
1825 pLockData->fl_end = parm_data->start +
1826 parm_data->length - 1;
1827 pLockData->fl_pid = parm_data->pid;
1828 }
1798 } 1829 }
1799 1830
1800plk_err_exit: 1831plk_err_exit:
@@ -3230,8 +3261,72 @@ QInfRetry:
3230 return rc; 3261 return rc;
3231} 3262}
3232 3263
3264int
3265CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
3266 u16 netfid, FILE_ALL_INFO *pFindData)
3267{
3268 struct smb_t2_qfi_req *pSMB = NULL;
3269 struct smb_t2_qfi_rsp *pSMBr = NULL;
3270 int rc = 0;
3271 int bytes_returned;
3272 __u16 params, byte_count;
3273
3274QFileInfoRetry:
3275 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3276 (void **) &pSMBr);
3277 if (rc)
3278 return rc;
3279
3280 params = 2 /* level */ + 2 /* fid */;
3281 pSMB->t2.TotalDataCount = 0;
3282 pSMB->t2.MaxParameterCount = cpu_to_le16(4);
3283 /* BB find exact max data count below from sess structure BB */
3284 pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
3285 pSMB->t2.MaxSetupCount = 0;
3286 pSMB->t2.Reserved = 0;
3287 pSMB->t2.Flags = 0;
3288 pSMB->t2.Timeout = 0;
3289 pSMB->t2.Reserved2 = 0;
3290 pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
3291 Fid) - 4);
3292 pSMB->t2.DataCount = 0;
3293 pSMB->t2.DataOffset = 0;
3294 pSMB->t2.SetupCount = 1;
3295 pSMB->t2.Reserved3 = 0;
3296 pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
3297 byte_count = params + 1 /* pad */ ;
3298 pSMB->t2.TotalParameterCount = cpu_to_le16(params);
3299 pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
3300 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
3301 pSMB->Pad = 0;
3302 pSMB->Fid = netfid;
3303 pSMB->hdr.smb_buf_length += byte_count;
3304
3305 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3306 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3307 if (rc) {
3308 cFYI(1, ("Send error in QPathInfo = %d", rc));
3309 } else { /* decode response */
3310 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3233 3311
3312 if (rc) /* BB add auto retry on EOPNOTSUPP? */
3313 rc = -EIO;
3314 else if (pSMBr->ByteCount < 40)
3315 rc = -EIO; /* bad smb */
3316 else if (pFindData) {
3317 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
3318 memcpy((char *) pFindData,
3319 (char *) &pSMBr->hdr.Protocol +
3320 data_offset, sizeof(FILE_ALL_INFO));
3321 } else
3322 rc = -ENOMEM;
3323 }
3324 cifs_buf_release(pSMB);
3325 if (rc == -EAGAIN)
3326 goto QFileInfoRetry;
3234 3327
3328 return rc;
3329}
3235 3330
3236int 3331int
3237CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon, 3332CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
@@ -3335,6 +3430,75 @@ QPathInfoRetry:
3335} 3430}
3336 3431
3337int 3432int
3433CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
3434 u16 netfid, FILE_UNIX_BASIC_INFO *pFindData)
3435{
3436 struct smb_t2_qfi_req *pSMB = NULL;
3437 struct smb_t2_qfi_rsp *pSMBr = NULL;
3438 int rc = 0;
3439 int bytes_returned;
3440 __u16 params, byte_count;
3441
3442UnixQFileInfoRetry:
3443 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3444 (void **) &pSMBr);
3445 if (rc)
3446 return rc;
3447
3448 params = 2 /* level */ + 2 /* fid */;
3449 pSMB->t2.TotalDataCount = 0;
3450 pSMB->t2.MaxParameterCount = cpu_to_le16(4);
3451 /* BB find exact max data count below from sess structure BB */
3452 pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
3453 pSMB->t2.MaxSetupCount = 0;
3454 pSMB->t2.Reserved = 0;
3455 pSMB->t2.Flags = 0;
3456 pSMB->t2.Timeout = 0;
3457 pSMB->t2.Reserved2 = 0;
3458 pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
3459 Fid) - 4);
3460 pSMB->t2.DataCount = 0;
3461 pSMB->t2.DataOffset = 0;
3462 pSMB->t2.SetupCount = 1;
3463 pSMB->t2.Reserved3 = 0;
3464 pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
3465 byte_count = params + 1 /* pad */ ;
3466 pSMB->t2.TotalParameterCount = cpu_to_le16(params);
3467 pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
3468 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
3469 pSMB->Pad = 0;
3470 pSMB->Fid = netfid;
3471 pSMB->hdr.smb_buf_length += byte_count;
3472
3473 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3474 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3475 if (rc) {
3476 cFYI(1, ("Send error in QPathInfo = %d", rc));
3477 } else { /* decode response */
3478 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3479
3480 if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
3481 cERROR(1, ("Malformed FILE_UNIX_BASIC_INFO response.\n"
3482 "Unix Extensions can be disabled on mount "
3483 "by specifying the nosfu mount option."));
3484 rc = -EIO; /* bad smb */
3485 } else {
3486 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
3487 memcpy((char *) pFindData,
3488 (char *) &pSMBr->hdr.Protocol +
3489 data_offset,
3490 sizeof(FILE_UNIX_BASIC_INFO));
3491 }
3492 }
3493
3494 cifs_buf_release(pSMB);
3495 if (rc == -EAGAIN)
3496 goto UnixQFileInfoRetry;
3497
3498 return rc;
3499}
3500
3501int
3338CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon, 3502CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
3339 const unsigned char *searchName, 3503 const unsigned char *searchName,
3340 FILE_UNIX_BASIC_INFO *pFindData, 3504 FILE_UNIX_BASIC_INFO *pFindData,
@@ -3886,7 +4050,7 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
3886 goto parse_DFS_referrals_exit; 4050 goto parse_DFS_referrals_exit;
3887 } 4051 }
3888 4052
3889 /* collect neccessary data from referrals */ 4053 /* collect necessary data from referrals */
3890 for (i = 0; i < *num_of_nodes; i++) { 4054 for (i = 0; i < *num_of_nodes; i++) {
3891 char *temp; 4055 char *temp;
3892 int max_len; 4056 int max_len;
@@ -5269,22 +5433,34 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
5269 cifs_buf_release(pSMB); 5433 cifs_buf_release(pSMB);
5270 return rc; 5434 return rc;
5271} 5435}
5436
5272#ifdef CONFIG_CIFS_XATTR 5437#ifdef CONFIG_CIFS_XATTR
5438/*
5439 * Do a path-based QUERY_ALL_EAS call and parse the result. This is a common
5440 * function used by listxattr and getxattr type calls. When ea_name is set,
5441 * it looks for that attribute name and stuffs that value into the EAData
5442 * buffer. When ea_name is NULL, it stuffs a list of attribute names into the
5443 * buffer. In both cases, the return value is either the length of the
5444 * resulting data or a negative error code. If EAData is a NULL pointer then
5445 * the data isn't copied to it, but the length is returned.
5446 */
5273ssize_t 5447ssize_t
5274CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon, 5448CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
5275 const unsigned char *searchName, 5449 const unsigned char *searchName, const unsigned char *ea_name,
5276 char *EAData, size_t buf_size, 5450 char *EAData, size_t buf_size,
5277 const struct nls_table *nls_codepage, int remap) 5451 const struct nls_table *nls_codepage, int remap)
5278{ 5452{
5279 /* BB assumes one setup word */ 5453 /* BB assumes one setup word */
5280 TRANSACTION2_QPI_REQ *pSMB = NULL; 5454 TRANSACTION2_QPI_REQ *pSMB = NULL;
5281 TRANSACTION2_QPI_RSP *pSMBr = NULL; 5455 TRANSACTION2_QPI_RSP *pSMBr = NULL;
5282 int rc = 0; 5456 int rc = 0;
5283 int bytes_returned; 5457 int bytes_returned;
5284 int name_len; 5458 int list_len;
5459 struct fealist *ea_response_data;
5285 struct fea *temp_fea; 5460 struct fea *temp_fea;
5286 char *temp_ptr; 5461 char *temp_ptr;
5287 __u16 params, byte_count; 5462 char *end_of_smb;
5463 __u16 params, byte_count, data_offset;
5288 5464
5289 cFYI(1, ("In Query All EAs path %s", searchName)); 5465 cFYI(1, ("In Query All EAs path %s", searchName));
5290QAllEAsRetry: 5466QAllEAsRetry:
@@ -5294,22 +5470,22 @@ QAllEAsRetry:
5294 return rc; 5470 return rc;
5295 5471
5296 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 5472 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
5297 name_len = 5473 list_len =
5298 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, 5474 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
5299 PATH_MAX, nls_codepage, remap); 5475 PATH_MAX, nls_codepage, remap);
5300 name_len++; /* trailing null */ 5476 list_len++; /* trailing null */
5301 name_len *= 2; 5477 list_len *= 2;
5302 } else { /* BB improve the check for buffer overruns BB */ 5478 } else { /* BB improve the check for buffer overruns BB */
5303 name_len = strnlen(searchName, PATH_MAX); 5479 list_len = strnlen(searchName, PATH_MAX);
5304 name_len++; /* trailing null */ 5480 list_len++; /* trailing null */
5305 strncpy(pSMB->FileName, searchName, name_len); 5481 strncpy(pSMB->FileName, searchName, list_len);
5306 } 5482 }
5307 5483
5308 params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */; 5484 params = 2 /* level */ + 4 /* reserved */ + list_len /* includes NUL */;
5309 pSMB->TotalDataCount = 0; 5485 pSMB->TotalDataCount = 0;
5310 pSMB->MaxParameterCount = cpu_to_le16(2); 5486 pSMB->MaxParameterCount = cpu_to_le16(2);
5311 /* BB find exact max SMB PDU from sess structure BB */ 5487 /* BB find exact max SMB PDU from sess structure BB */
5312 pSMB->MaxDataCount = cpu_to_le16(4000); 5488 pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
5313 pSMB->MaxSetupCount = 0; 5489 pSMB->MaxSetupCount = 0;
5314 pSMB->Reserved = 0; 5490 pSMB->Reserved = 0;
5315 pSMB->Flags = 0; 5491 pSMB->Flags = 0;
@@ -5334,237 +5510,117 @@ QAllEAsRetry:
5334 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5510 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5335 if (rc) { 5511 if (rc) {
5336 cFYI(1, ("Send error in QueryAllEAs = %d", rc)); 5512 cFYI(1, ("Send error in QueryAllEAs = %d", rc));
5337 } else { /* decode response */ 5513 goto QAllEAsOut;
5338 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 5514 }
5339 5515
5340 /* BB also check enough total bytes returned */ 5516
5341 /* BB we need to improve the validity checking 5517 /* BB also check enough total bytes returned */
5342 of these trans2 responses */ 5518 /* BB we need to improve the validity checking
5343 if (rc || (pSMBr->ByteCount < 4)) 5519 of these trans2 responses */
5344 rc = -EIO; /* bad smb */ 5520
5345 /* else if (pFindData){ 5521 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
5346 memcpy((char *) pFindData, 5522 if (rc || (pSMBr->ByteCount < 4)) {
5347 (char *) &pSMBr->hdr.Protocol + 5523 rc = -EIO; /* bad smb */
5348 data_offset, kl); 5524 goto QAllEAsOut;
5349 }*/ else {
5350 /* check that length of list is not more than bcc */
5351 /* check that each entry does not go beyond length
5352 of list */
5353 /* check that each element of each entry does not
5354 go beyond end of list */
5355 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
5356 struct fealist *ea_response_data;
5357 rc = 0;
5358 /* validate_trans2_offsets() */
5359 /* BB check if start of smb + data_offset > &bcc+ bcc */
5360 ea_response_data = (struct fealist *)
5361 (((char *) &pSMBr->hdr.Protocol) +
5362 data_offset);
5363 name_len = le32_to_cpu(ea_response_data->list_len);
5364 cFYI(1, ("ea length %d", name_len));
5365 if (name_len <= 8) {
5366 /* returned EA size zeroed at top of function */
5367 cFYI(1, ("empty EA list returned from server"));
5368 } else {
5369 /* account for ea list len */
5370 name_len -= 4;
5371 temp_fea = ea_response_data->list;
5372 temp_ptr = (char *)temp_fea;
5373 while (name_len > 0) {
5374 __u16 value_len;
5375 name_len -= 4;
5376 temp_ptr += 4;
5377 rc += temp_fea->name_len;
5378 /* account for prefix user. and trailing null */
5379 rc = rc + 5 + 1;
5380 if (rc < (int)buf_size) {
5381 memcpy(EAData, "user.", 5);
5382 EAData += 5;
5383 memcpy(EAData, temp_ptr,
5384 temp_fea->name_len);
5385 EAData += temp_fea->name_len;
5386 /* null terminate name */
5387 *EAData = 0;
5388 EAData = EAData + 1;
5389 } else if (buf_size == 0) {
5390 /* skip copy - calc size only */
5391 } else {
5392 /* stop before overrun buffer */
5393 rc = -ERANGE;
5394 break;
5395 }
5396 name_len -= temp_fea->name_len;
5397 temp_ptr += temp_fea->name_len;
5398 /* account for trailing null */
5399 name_len--;
5400 temp_ptr++;
5401 value_len =
5402 le16_to_cpu(temp_fea->value_len);
5403 name_len -= value_len;
5404 temp_ptr += value_len;
5405 /* BB check that temp_ptr is still
5406 within the SMB BB*/
5407
5408 /* no trailing null to account for
5409 in value len */
5410 /* go on to next EA */
5411 temp_fea = (struct fea *)temp_ptr;
5412 }
5413 }
5414 }
5415 } 5525 }
5416 cifs_buf_release(pSMB);
5417 if (rc == -EAGAIN)
5418 goto QAllEAsRetry;
5419 5526
5420 return (ssize_t)rc; 5527 /* check that length of list is not more than bcc */
5421} 5528 /* check that each entry does not go beyond length
5529 of list */
5530 /* check that each element of each entry does not
5531 go beyond end of list */
5532 /* validate_trans2_offsets() */
5533 /* BB check if start of smb + data_offset > &bcc+ bcc */
5422 5534
5423ssize_t CIFSSMBQueryEA(const int xid, struct cifsTconInfo *tcon, 5535 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
5424 const unsigned char *searchName, const unsigned char *ea_name, 5536 ea_response_data = (struct fealist *)
5425 unsigned char *ea_value, size_t buf_size, 5537 (((char *) &pSMBr->hdr.Protocol) + data_offset);
5426 const struct nls_table *nls_codepage, int remap)
5427{
5428 TRANSACTION2_QPI_REQ *pSMB = NULL;
5429 TRANSACTION2_QPI_RSP *pSMBr = NULL;
5430 int rc = 0;
5431 int bytes_returned;
5432 int name_len;
5433 struct fea *temp_fea;
5434 char *temp_ptr;
5435 __u16 params, byte_count;
5436 5538
5437 cFYI(1, ("In Query EA path %s", searchName)); 5539 list_len = le32_to_cpu(ea_response_data->list_len);
5438QEARetry: 5540 cFYI(1, ("ea length %d", list_len));
5439 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 5541 if (list_len <= 8) {
5440 (void **) &pSMBr); 5542 cFYI(1, ("empty EA list returned from server"));
5441 if (rc) 5543 goto QAllEAsOut;
5442 return rc; 5544 }
5443 5545
5444 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 5546 /* make sure list_len doesn't go past end of SMB */
5445 name_len = 5547 end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr);
5446 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, 5548 if ((char *)ea_response_data + list_len > end_of_smb) {
5447 PATH_MAX, nls_codepage, remap); 5549 cFYI(1, ("EA list appears to go beyond SMB"));
5448 name_len++; /* trailing null */ 5550 rc = -EIO;
5449 name_len *= 2; 5551 goto QAllEAsOut;
5450 } else { /* BB improve the check for buffer overruns BB */
5451 name_len = strnlen(searchName, PATH_MAX);
5452 name_len++; /* trailing null */
5453 strncpy(pSMB->FileName, searchName, name_len);
5454 } 5552 }
5455 5553
5456 params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */; 5554 /* account for ea list len */
5457 pSMB->TotalDataCount = 0; 5555 list_len -= 4;
5458 pSMB->MaxParameterCount = cpu_to_le16(2); 5556 temp_fea = ea_response_data->list;
5459 /* BB find exact max SMB PDU from sess structure BB */ 5557 temp_ptr = (char *)temp_fea;
5460 pSMB->MaxDataCount = cpu_to_le16(4000); 5558 while (list_len > 0) {
5461 pSMB->MaxSetupCount = 0; 5559 unsigned int name_len;
5462 pSMB->Reserved = 0; 5560 __u16 value_len;
5463 pSMB->Flags = 0; 5561
5464 pSMB->Timeout = 0; 5562 list_len -= 4;
5465 pSMB->Reserved2 = 0; 5563 temp_ptr += 4;
5466 pSMB->ParameterOffset = cpu_to_le16(offsetof( 5564 /* make sure we can read name_len and value_len */
5467 struct smb_com_transaction2_qpi_req, InformationLevel) - 4); 5565 if (list_len < 0) {
5468 pSMB->DataCount = 0; 5566 cFYI(1, ("EA entry goes beyond length of list"));
5469 pSMB->DataOffset = 0; 5567 rc = -EIO;
5470 pSMB->SetupCount = 1; 5568 goto QAllEAsOut;
5471 pSMB->Reserved3 = 0; 5569 }
5472 pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION);
5473 byte_count = params + 1 /* pad */ ;
5474 pSMB->TotalParameterCount = cpu_to_le16(params);
5475 pSMB->ParameterCount = pSMB->TotalParameterCount;
5476 pSMB->InformationLevel = cpu_to_le16(SMB_INFO_QUERY_ALL_EAS);
5477 pSMB->Reserved4 = 0;
5478 pSMB->hdr.smb_buf_length += byte_count;
5479 pSMB->ByteCount = cpu_to_le16(byte_count);
5480 5570
5481 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5571 name_len = temp_fea->name_len;
5482 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5572 value_len = le16_to_cpu(temp_fea->value_len);
5483 if (rc) { 5573 list_len -= name_len + 1 + value_len;
5484 cFYI(1, ("Send error in Query EA = %d", rc)); 5574 if (list_len < 0) {
5485 } else { /* decode response */ 5575 cFYI(1, ("EA entry goes beyond length of list"));
5486 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 5576 rc = -EIO;
5577 goto QAllEAsOut;
5578 }
5487 5579
5488 /* BB also check enough total bytes returned */ 5580 if (ea_name) {
5489 /* BB we need to improve the validity checking 5581 if (strncmp(ea_name, temp_ptr, name_len) == 0) {
5490 of these trans2 responses */ 5582 temp_ptr += name_len + 1;
5491 if (rc || (pSMBr->ByteCount < 4)) 5583 rc = value_len;
5492 rc = -EIO; /* bad smb */ 5584 if (buf_size == 0)
5493 /* else if (pFindData){ 5585 goto QAllEAsOut;
5494 memcpy((char *) pFindData, 5586 if ((size_t)value_len > buf_size) {
5495 (char *) &pSMBr->hdr.Protocol + 5587 rc = -ERANGE;
5496 data_offset, kl); 5588 goto QAllEAsOut;
5497 }*/ else {
5498 /* check that length of list is not more than bcc */
5499 /* check that each entry does not go beyond length
5500 of list */
5501 /* check that each element of each entry does not
5502 go beyond end of list */
5503 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
5504 struct fealist *ea_response_data;
5505 rc = -ENODATA;
5506 /* validate_trans2_offsets() */
5507 /* BB check if start of smb + data_offset > &bcc+ bcc*/
5508 ea_response_data = (struct fealist *)
5509 (((char *) &pSMBr->hdr.Protocol) +
5510 data_offset);
5511 name_len = le32_to_cpu(ea_response_data->list_len);
5512 cFYI(1, ("ea length %d", name_len));
5513 if (name_len <= 8) {
5514 /* returned EA size zeroed at top of function */
5515 cFYI(1, ("empty EA list returned from server"));
5516 } else {
5517 /* account for ea list len */
5518 name_len -= 4;
5519 temp_fea = ea_response_data->list;
5520 temp_ptr = (char *)temp_fea;
5521 /* loop through checking if we have a matching
5522 name and then return the associated value */
5523 while (name_len > 0) {
5524 __u16 value_len;
5525 name_len -= 4;
5526 temp_ptr += 4;
5527 value_len =
5528 le16_to_cpu(temp_fea->value_len);
5529 /* BB validate that value_len falls within SMB,
5530 even though maximum for name_len is 255 */
5531 if (memcmp(temp_fea->name, ea_name,
5532 temp_fea->name_len) == 0) {
5533 /* found a match */
5534 rc = value_len;
5535 /* account for prefix user. and trailing null */
5536 if (rc <= (int)buf_size) {
5537 memcpy(ea_value,
5538 temp_fea->name+temp_fea->name_len+1,
5539 rc);
5540 /* ea values, unlike ea
5541 names, are not null
5542 terminated */
5543 } else if (buf_size == 0) {
5544 /* skip copy - calc size only */
5545 } else {
5546 /* stop before overrun buffer */
5547 rc = -ERANGE;
5548 }
5549 break;
5550 }
5551 name_len -= temp_fea->name_len;
5552 temp_ptr += temp_fea->name_len;
5553 /* account for trailing null */
5554 name_len--;
5555 temp_ptr++;
5556 name_len -= value_len;
5557 temp_ptr += value_len;
5558 /* No trailing null to account for in
5559 value_len. Go on to next EA */
5560 temp_fea = (struct fea *)temp_ptr;
5561 } 5589 }
5590 memcpy(EAData, temp_ptr, value_len);
5591 goto QAllEAsOut;
5592 }
5593 } else {
5594 /* account for prefix user. and trailing null */
5595 rc += (5 + 1 + name_len);
5596 if (rc < (int) buf_size) {
5597 memcpy(EAData, "user.", 5);
5598 EAData += 5;
5599 memcpy(EAData, temp_ptr, name_len);
5600 EAData += name_len;
5601 /* null terminate name */
5602 *EAData = 0;
5603 ++EAData;
5604 } else if (buf_size == 0) {
5605 /* skip copy - calc size only */
5606 } else {
5607 /* stop before overrun buffer */
5608 rc = -ERANGE;
5609 break;
5562 } 5610 }
5563 } 5611 }
5612 temp_ptr += name_len + 1 + value_len;
5613 temp_fea = (struct fea *)temp_ptr;
5564 } 5614 }
5615
5616 /* didn't find the named attribute */
5617 if (ea_name)
5618 rc = -ENODATA;
5619
5620QAllEAsOut:
5565 cifs_buf_release(pSMB); 5621 cifs_buf_release(pSMB);
5566 if (rc == -EAGAIN) 5622 if (rc == -EAGAIN)
5567 goto QEARetry; 5623 goto QAllEAsRetry;
5568 5624
5569 return (ssize_t)rc; 5625 return (ssize_t)rc;
5570} 5626}
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 63ea83ff687f..d9566bf8f917 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -23,6 +23,7 @@
23#include <linux/string.h> 23#include <linux/string.h>
24#include <linux/list.h> 24#include <linux/list.h>
25#include <linux/wait.h> 25#include <linux/wait.h>
26#include <linux/slab.h>
26#include <linux/pagemap.h> 27#include <linux/pagemap.h>
27#include <linux/ctype.h> 28#include <linux/ctype.h>
28#include <linux/utsname.h> 29#include <linux/utsname.h>
@@ -98,7 +99,7 @@ struct smb_vol {
98 bool nostrictsync:1; /* do not force expensive SMBflush on every sync */ 99 bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
99 unsigned int rsize; 100 unsigned int rsize;
100 unsigned int wsize; 101 unsigned int wsize;
101 unsigned int sockopt; 102 bool sockopt_tcp_nodelay:1;
102 unsigned short int port; 103 unsigned short int port;
103 char *prepath; 104 char *prepath;
104}; 105};
@@ -1142,9 +1143,11 @@ cifs_parse_mount_options(char *options, const char *devname,
1142 simple_strtoul(value, &value, 0); 1143 simple_strtoul(value, &value, 0);
1143 } 1144 }
1144 } else if (strnicmp(data, "sockopt", 5) == 0) { 1145 } else if (strnicmp(data, "sockopt", 5) == 0) {
1145 if (value && *value) { 1146 if (!value || !*value) {
1146 vol->sockopt = 1147 cERROR(1, ("no socket option specified"));
1147 simple_strtoul(value, &value, 0); 1148 continue;
1149 } else if (strnicmp(value, "TCP_NODELAY", 11) == 0) {
1150 vol->sockopt_tcp_nodelay = 1;
1148 } 1151 }
1149 } else if (strnicmp(data, "netbiosname", 4) == 0) { 1152 } else if (strnicmp(data, "netbiosname", 4) == 0) {
1150 if (!value || !*value || (*value == ' ')) { 1153 if (!value || !*value || (*value == ' ')) {
@@ -1514,6 +1517,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1514 1517
1515 tcp_ses->noblocksnd = volume_info->noblocksnd; 1518 tcp_ses->noblocksnd = volume_info->noblocksnd;
1516 tcp_ses->noautotune = volume_info->noautotune; 1519 tcp_ses->noautotune = volume_info->noautotune;
1520 tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay;
1517 atomic_set(&tcp_ses->inFlight, 0); 1521 atomic_set(&tcp_ses->inFlight, 0);
1518 init_waitqueue_head(&tcp_ses->response_q); 1522 init_waitqueue_head(&tcp_ses->response_q);
1519 init_waitqueue_head(&tcp_ses->request_q); 1523 init_waitqueue_head(&tcp_ses->request_q);
@@ -1764,6 +1768,7 @@ static int
1764ipv4_connect(struct TCP_Server_Info *server) 1768ipv4_connect(struct TCP_Server_Info *server)
1765{ 1769{
1766 int rc = 0; 1770 int rc = 0;
1771 int val;
1767 bool connected = false; 1772 bool connected = false;
1768 __be16 orig_port = 0; 1773 __be16 orig_port = 0;
1769 struct socket *socket = server->ssocket; 1774 struct socket *socket = server->ssocket;
@@ -1845,6 +1850,14 @@ ipv4_connect(struct TCP_Server_Info *server)
1845 socket->sk->sk_rcvbuf = 140 * 1024; 1850 socket->sk->sk_rcvbuf = 140 * 1024;
1846 } 1851 }
1847 1852
1853 if (server->tcp_nodelay) {
1854 val = 1;
1855 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
1856 (char *)&val, sizeof(val));
1857 if (rc)
1858 cFYI(1, ("set TCP_NODELAY socket option error %d", rc));
1859 }
1860
1848 cFYI(1, ("sndbuf %d rcvbuf %d rcvtimeo 0x%lx", 1861 cFYI(1, ("sndbuf %d rcvbuf %d rcvtimeo 0x%lx",
1849 socket->sk->sk_sndbuf, 1862 socket->sk->sk_sndbuf,
1850 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo)); 1863 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo));
@@ -1916,6 +1929,7 @@ static int
1916ipv6_connect(struct TCP_Server_Info *server) 1929ipv6_connect(struct TCP_Server_Info *server)
1917{ 1930{
1918 int rc = 0; 1931 int rc = 0;
1932 int val;
1919 bool connected = false; 1933 bool connected = false;
1920 __be16 orig_port = 0; 1934 __be16 orig_port = 0;
1921 struct socket *socket = server->ssocket; 1935 struct socket *socket = server->ssocket;
@@ -1987,6 +2001,15 @@ ipv6_connect(struct TCP_Server_Info *server)
1987 */ 2001 */
1988 socket->sk->sk_rcvtimeo = 7 * HZ; 2002 socket->sk->sk_rcvtimeo = 7 * HZ;
1989 socket->sk->sk_sndtimeo = 5 * HZ; 2003 socket->sk->sk_sndtimeo = 5 * HZ;
2004
2005 if (server->tcp_nodelay) {
2006 val = 1;
2007 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
2008 (char *)&val, sizeof(val));
2009 if (rc)
2010 cFYI(1, ("set TCP_NODELAY socket option error %d", rc));
2011 }
2012
1990 server->ssocket = socket; 2013 server->ssocket = socket;
1991 2014
1992 return rc; 2015 return rc;
@@ -2287,12 +2310,12 @@ int
2287cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, 2310cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2288 char *mount_data_global, const char *devname) 2311 char *mount_data_global, const char *devname)
2289{ 2312{
2290 int rc = 0; 2313 int rc;
2291 int xid; 2314 int xid;
2292 struct smb_vol *volume_info; 2315 struct smb_vol *volume_info;
2293 struct cifsSesInfo *pSesInfo = NULL; 2316 struct cifsSesInfo *pSesInfo;
2294 struct cifsTconInfo *tcon = NULL; 2317 struct cifsTconInfo *tcon;
2295 struct TCP_Server_Info *srvTcp = NULL; 2318 struct TCP_Server_Info *srvTcp;
2296 char *full_path; 2319 char *full_path;
2297 char *mount_data = mount_data_global; 2320 char *mount_data = mount_data_global;
2298#ifdef CONFIG_CIFS_DFS_UPCALL 2321#ifdef CONFIG_CIFS_DFS_UPCALL
@@ -2301,6 +2324,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2301 int referral_walks_count = 0; 2324 int referral_walks_count = 0;
2302try_mount_again: 2325try_mount_again:
2303#endif 2326#endif
2327 rc = 0;
2328 tcon = NULL;
2329 pSesInfo = NULL;
2330 srvTcp = NULL;
2304 full_path = NULL; 2331 full_path = NULL;
2305 2332
2306 xid = GetXid(); 2333 xid = GetXid();
@@ -2362,13 +2389,13 @@ try_mount_again:
2362 */ 2389 */
2363 cifs_put_tcp_session(srvTcp); 2390 cifs_put_tcp_session(srvTcp);
2364 2391
2365 down(&pSesInfo->sesSem); 2392 mutex_lock(&pSesInfo->session_mutex);
2366 if (pSesInfo->need_reconnect) { 2393 if (pSesInfo->need_reconnect) {
2367 cFYI(1, ("Session needs reconnect")); 2394 cFYI(1, ("Session needs reconnect"));
2368 rc = cifs_setup_session(xid, pSesInfo, 2395 rc = cifs_setup_session(xid, pSesInfo,
2369 cifs_sb->local_nls); 2396 cifs_sb->local_nls);
2370 } 2397 }
2371 up(&pSesInfo->sesSem); 2398 mutex_unlock(&pSesInfo->session_mutex);
2372 } else if (!rc) { 2399 } else if (!rc) {
2373 cFYI(1, ("Existing smb sess not found")); 2400 cFYI(1, ("Existing smb sess not found"));
2374 pSesInfo = sesInfoAlloc(); 2401 pSesInfo = sesInfoAlloc();
@@ -2411,12 +2438,12 @@ try_mount_again:
2411 } 2438 }
2412 pSesInfo->linux_uid = volume_info->linux_uid; 2439 pSesInfo->linux_uid = volume_info->linux_uid;
2413 pSesInfo->overrideSecFlg = volume_info->secFlg; 2440 pSesInfo->overrideSecFlg = volume_info->secFlg;
2414 down(&pSesInfo->sesSem); 2441 mutex_lock(&pSesInfo->session_mutex);
2415 2442
2416 /* BB FIXME need to pass vol->secFlgs BB */ 2443 /* BB FIXME need to pass vol->secFlgs BB */
2417 rc = cifs_setup_session(xid, pSesInfo, 2444 rc = cifs_setup_session(xid, pSesInfo,
2418 cifs_sb->local_nls); 2445 cifs_sb->local_nls);
2419 up(&pSesInfo->sesSem); 2446 mutex_unlock(&pSesInfo->session_mutex);
2420 } 2447 }
2421 2448
2422 /* search for existing tcon to this server share */ 2449 /* search for existing tcon to this server share */
@@ -2597,6 +2624,7 @@ remote_path_check:
2597 2624
2598 cleanup_volume_info(&volume_info); 2625 cleanup_volume_info(&volume_info);
2599 referral_walks_count++; 2626 referral_walks_count++;
2627 FreeXid(xid);
2600 goto try_mount_again; 2628 goto try_mount_again;
2601 } 2629 }
2602#else /* No DFS support, return error on mount */ 2630#else /* No DFS support, return error on mount */
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 6ccf7262d1b7..e9f7ecc2714b 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -739,7 +739,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
739 int isValid = 1; 739 int isValid = 1;
740 740
741 if (direntry->d_inode) { 741 if (direntry->d_inode) {
742 if (cifs_revalidate(direntry)) 742 if (cifs_revalidate_dentry(direntry))
743 return 0; 743 return 0;
744 } else { 744 } else {
745 cFYI(1, ("neg dentry 0x%p name = %s", 745 cFYI(1, ("neg dentry 0x%p name = %s",
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 87948147d7ec..6f8a0e3fb25b 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -23,6 +23,7 @@
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */ 24 */
25 25
26#include <linux/slab.h>
26#include <keys/user-type.h> 27#include <keys/user-type.h>
27#include "dns_resolve.h" 28#include "dns_resolve.h"
28#include "cifsglob.h" 29#include "cifsglob.h"
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 057e1dae12ab..9b11a8f56f3a 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -31,6 +31,7 @@
31#include <linux/task_io_accounting_ops.h> 31#include <linux/task_io_accounting_ops.h>
32#include <linux/delay.h> 32#include <linux/delay.h>
33#include <linux/mount.h> 33#include <linux/mount.h>
34#include <linux/slab.h>
34#include <asm/div64.h> 35#include <asm/div64.h>
35#include "cifsfs.h" 36#include "cifsfs.h"
36#include "cifspdu.h" 37#include "cifspdu.h"
@@ -219,8 +220,8 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
219 cFYI(1, ("inode unchanged on server")); 220 cFYI(1, ("inode unchanged on server"));
220 } else { 221 } else {
221 if (file->f_path.dentry->d_inode->i_mapping) { 222 if (file->f_path.dentry->d_inode->i_mapping) {
222 /* BB no need to lock inode until after invalidate 223 /* BB no need to lock inode until after invalidate
223 since namei code should already have it locked? */ 224 since namei code should already have it locked? */
224 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping); 225 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
225 if (rc != 0) 226 if (rc != 0)
226 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc; 227 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
@@ -838,8 +839,32 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
838 839
839 } else { 840 } else {
840 /* if rc == ERR_SHARING_VIOLATION ? */ 841 /* if rc == ERR_SHARING_VIOLATION ? */
841 rc = 0; /* do not change lock type to unlock 842 rc = 0;
842 since range in use */ 843
844 if (lockType & LOCKING_ANDX_SHARED_LOCK) {
845 pfLock->fl_type = F_WRLCK;
846 } else {
847 rc = CIFSSMBLock(xid, tcon, netfid, length,
848 pfLock->fl_start, 0, 1,
849 lockType | LOCKING_ANDX_SHARED_LOCK,
850 0 /* wait flag */);
851 if (rc == 0) {
852 rc = CIFSSMBLock(xid, tcon, netfid,
853 length, pfLock->fl_start, 1, 0,
854 lockType |
855 LOCKING_ANDX_SHARED_LOCK,
856 0 /* wait flag */);
857 pfLock->fl_type = F_RDLCK;
858 if (rc != 0)
859 cERROR(1, ("Error unlocking "
860 "previously locked range %d "
861 "during test of lock", rc));
862 rc = 0;
863 } else {
864 pfLock->fl_type = F_WRLCK;
865 rc = 0;
866 }
867 }
843 } 868 }
844 869
845 FreeXid(xid); 870 FreeXid(xid);
@@ -1890,11 +1915,10 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1890 1915
1891int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) 1916int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1892{ 1917{
1893 struct dentry *dentry = file->f_path.dentry;
1894 int rc, xid; 1918 int rc, xid;
1895 1919
1896 xid = GetXid(); 1920 xid = GetXid();
1897 rc = cifs_revalidate(dentry); 1921 rc = cifs_revalidate_file(file);
1898 if (rc) { 1922 if (rc) {
1899 cFYI(1, ("Validation prior to mmap failed, error=%d", rc)); 1923 cFYI(1, ("Validation prior to mmap failed, error=%d", rc));
1900 FreeXid(xid); 1924 FreeXid(xid);
@@ -2289,9 +2313,9 @@ cifs_oplock_break(struct slow_work *work)
2289 if (inode && S_ISREG(inode->i_mode)) { 2313 if (inode && S_ISREG(inode->i_mode)) {
2290#ifdef CONFIG_CIFS_EXPERIMENTAL 2314#ifdef CONFIG_CIFS_EXPERIMENTAL
2291 if (cinode->clientCanCacheAll == 0) 2315 if (cinode->clientCanCacheAll == 0)
2292 break_lease(inode, FMODE_READ); 2316 break_lease(inode, O_RDONLY);
2293 else if (cinode->clientCanCacheRead == 0) 2317 else if (cinode->clientCanCacheRead == 0)
2294 break_lease(inode, FMODE_WRITE); 2318 break_lease(inode, O_WRONLY);
2295#endif 2319#endif
2296 rc = filemap_fdatawrite(inode->i_mapping); 2320 rc = filemap_fdatawrite(inode->i_mapping);
2297 if (cinode->clientCanCacheRead == 0) { 2321 if (cinode->clientCanCacheRead == 0) {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index cf18ee765590..35ec11716213 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -20,6 +20,7 @@
20 */ 20 */
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/stat.h> 22#include <linux/stat.h>
23#include <linux/slab.h>
23#include <linux/pagemap.h> 24#include <linux/pagemap.h>
24#include <asm/div64.h> 25#include <asm/div64.h>
25#include "cifsfs.h" 26#include "cifsfs.h"
@@ -77,6 +78,41 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
77 } 78 }
78} 79}
79 80
81/* check inode attributes against fattr. If they don't match, tag the
82 * inode for cache invalidation
83 */
84static void
85cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
86{
87 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
88
89 cFYI(1, ("%s: revalidating inode %llu", __func__, cifs_i->uniqueid));
90
91 if (inode->i_state & I_NEW) {
92 cFYI(1, ("%s: inode %llu is new", __func__, cifs_i->uniqueid));
93 return;
94 }
95
96 /* don't bother with revalidation if we have an oplock */
97 if (cifs_i->clientCanCacheRead) {
98 cFYI(1, ("%s: inode %llu is oplocked", __func__,
99 cifs_i->uniqueid));
100 return;
101 }
102
103 /* revalidate if mtime or size have changed */
104 if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) &&
105 cifs_i->server_eof == fattr->cf_eof) {
106 cFYI(1, ("%s: inode %llu is unchanged", __func__,
107 cifs_i->uniqueid));
108 return;
109 }
110
111 cFYI(1, ("%s: invalidating inode %llu mapping", __func__,
112 cifs_i->uniqueid));
113 cifs_i->invalid_mapping = true;
114}
115
80/* populate an inode with info from a cifs_fattr struct */ 116/* populate an inode with info from a cifs_fattr struct */
81void 117void
82cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) 118cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
@@ -85,6 +121,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
85 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 121 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
86 unsigned long oldtime = cifs_i->time; 122 unsigned long oldtime = cifs_i->time;
87 123
124 cifs_revalidate_cache(inode, fattr);
125
88 inode->i_atime = fattr->cf_atime; 126 inode->i_atime = fattr->cf_atime;
89 inode->i_mtime = fattr->cf_mtime; 127 inode->i_mtime = fattr->cf_mtime;
90 inode->i_ctime = fattr->cf_ctime; 128 inode->i_ctime = fattr->cf_ctime;
@@ -111,6 +149,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
111 149
112 cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING; 150 cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING;
113 151
152 cifs_i->server_eof = fattr->cf_eof;
114 /* 153 /*
115 * Can't safely change the file size here if the client is writing to 154 * Can't safely change the file size here if the client is writing to
116 * it due to potential races. 155 * it due to potential races.
@@ -230,6 +269,31 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
230 fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL; 269 fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
231} 270}
232 271
272int cifs_get_file_info_unix(struct file *filp)
273{
274 int rc;
275 int xid;
276 FILE_UNIX_BASIC_INFO find_data;
277 struct cifs_fattr fattr;
278 struct inode *inode = filp->f_path.dentry->d_inode;
279 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
280 struct cifsTconInfo *tcon = cifs_sb->tcon;
281 struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
282
283 xid = GetXid();
284 rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
285 if (!rc) {
286 cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
287 } else if (rc == -EREMOTE) {
288 cifs_create_dfs_fattr(&fattr, inode->i_sb);
289 rc = 0;
290 }
291
292 cifs_fattr_to_inode(inode, &fattr);
293 FreeXid(xid);
294 return rc;
295}
296
233int cifs_get_inode_info_unix(struct inode **pinode, 297int cifs_get_inode_info_unix(struct inode **pinode,
234 const unsigned char *full_path, 298 const unsigned char *full_path,
235 struct super_block *sb, int xid) 299 struct super_block *sb, int xid)
@@ -366,7 +430,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
366 char ea_value[4]; 430 char ea_value[4];
367 __u32 mode; 431 __u32 mode;
368 432
369 rc = CIFSSMBQueryEA(xid, cifs_sb->tcon, path, "SETFILEBITS", 433 rc = CIFSSMBQAllEAs(xid, cifs_sb->tcon, path, "SETFILEBITS",
370 ea_value, 4 /* size of buf */, cifs_sb->local_nls, 434 ea_value, 4 /* size of buf */, cifs_sb->local_nls,
371 cifs_sb->mnt_cifs_flags & 435 cifs_sb->mnt_cifs_flags &
372 CIFS_MOUNT_MAP_SPECIAL_CHR); 436 CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -431,6 +495,47 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
431 fattr->cf_gid = cifs_sb->mnt_gid; 495 fattr->cf_gid = cifs_sb->mnt_gid;
432} 496}
433 497
498int cifs_get_file_info(struct file *filp)
499{
500 int rc;
501 int xid;
502 FILE_ALL_INFO find_data;
503 struct cifs_fattr fattr;
504 struct inode *inode = filp->f_path.dentry->d_inode;
505 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
506 struct cifsTconInfo *tcon = cifs_sb->tcon;
507 struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
508
509 xid = GetXid();
510 rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
511 if (rc == -EOPNOTSUPP || rc == -EINVAL) {
512 /*
513 * FIXME: legacy server -- fall back to path-based call?
514 * for now, just skip revalidating and mark inode for
515 * immediate reval.
516 */
517 rc = 0;
518 CIFS_I(inode)->time = 0;
519 goto cgfi_exit;
520 } else if (rc == -EREMOTE) {
521 cifs_create_dfs_fattr(&fattr, inode->i_sb);
522 rc = 0;
523 } else if (rc)
524 goto cgfi_exit;
525
526 /*
527 * don't bother with SFU junk here -- just mark inode as needing
528 * revalidation.
529 */
530 cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
531 fattr.cf_uniqueid = CIFS_I(inode)->uniqueid;
532 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
533 cifs_fattr_to_inode(inode, &fattr);
534cgfi_exit:
535 FreeXid(xid);
536 return rc;
537}
538
434int cifs_get_inode_info(struct inode **pinode, 539int cifs_get_inode_info(struct inode **pinode,
435 const unsigned char *full_path, FILE_ALL_INFO *pfindData, 540 const unsigned char *full_path, FILE_ALL_INFO *pfindData,
436 struct super_block *sb, int xid, const __u16 *pfid) 541 struct super_block *sb, int xid, const __u16 *pfid)
@@ -1388,135 +1493,103 @@ cifs_rename_exit:
1388 return rc; 1493 return rc;
1389} 1494}
1390 1495
1391int cifs_revalidate(struct dentry *direntry) 1496static bool
1497cifs_inode_needs_reval(struct inode *inode)
1392{ 1498{
1393 int xid; 1499 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
1394 int rc = 0, wbrc = 0;
1395 char *full_path;
1396 struct cifs_sb_info *cifs_sb;
1397 struct cifsInodeInfo *cifsInode;
1398 loff_t local_size;
1399 struct timespec local_mtime;
1400 bool invalidate_inode = false;
1401 1500
1402 if (direntry->d_inode == NULL) 1501 if (cifs_i->clientCanCacheRead)
1403 return -ENOENT; 1502 return false;
1404 1503
1405 cifsInode = CIFS_I(direntry->d_inode); 1504 if (!lookupCacheEnabled)
1505 return true;
1406 1506
1407 if (cifsInode == NULL) 1507 if (cifs_i->time == 0)
1408 return -ENOENT; 1508 return true;
1409 1509
1410 /* no sense revalidating inode info on file that no one can write */ 1510 /* FIXME: the actimeo should be tunable */
1411 if (CIFS_I(direntry->d_inode)->clientCanCacheRead) 1511 if (time_after_eq(jiffies, cifs_i->time + HZ))
1412 return rc; 1512 return true;
1513
1514 return false;
1515}
1516
1517/* check invalid_mapping flag and zap the cache if it's set */
1518static void
1519cifs_invalidate_mapping(struct inode *inode)
1520{
1521 int rc;
1522 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
1523
1524 cifs_i->invalid_mapping = false;
1525
1526 /* write back any cached data */
1527 if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
1528 rc = filemap_write_and_wait(inode->i_mapping);
1529 if (rc)
1530 cifs_i->write_behind_rc = rc;
1531 }
1532 invalidate_remote_inode(inode);
1533}
1534
1535int cifs_revalidate_file(struct file *filp)
1536{
1537 int rc = 0;
1538 struct inode *inode = filp->f_path.dentry->d_inode;
1539
1540 if (!cifs_inode_needs_reval(inode))
1541 goto check_inval;
1542
1543 if (CIFS_SB(inode->i_sb)->tcon->unix_ext)
1544 rc = cifs_get_file_info_unix(filp);
1545 else
1546 rc = cifs_get_file_info(filp);
1547
1548check_inval:
1549 if (CIFS_I(inode)->invalid_mapping)
1550 cifs_invalidate_mapping(inode);
1551
1552 return rc;
1553}
1554
1555/* revalidate a dentry's inode attributes */
1556int cifs_revalidate_dentry(struct dentry *dentry)
1557{
1558 int xid;
1559 int rc = 0;
1560 char *full_path = NULL;
1561 struct inode *inode = dentry->d_inode;
1562 struct super_block *sb = dentry->d_sb;
1563
1564 if (inode == NULL)
1565 return -ENOENT;
1413 1566
1414 xid = GetXid(); 1567 xid = GetXid();
1415 1568
1416 cifs_sb = CIFS_SB(direntry->d_sb); 1569 if (!cifs_inode_needs_reval(inode))
1570 goto check_inval;
1417 1571
1418 /* can not safely grab the rename sem here if rename calls revalidate 1572 /* can not safely grab the rename sem here if rename calls revalidate
1419 since that would deadlock */ 1573 since that would deadlock */
1420 full_path = build_path_from_dentry(direntry); 1574 full_path = build_path_from_dentry(dentry);
1421 if (full_path == NULL) { 1575 if (full_path == NULL) {
1422 rc = -ENOMEM; 1576 rc = -ENOMEM;
1423 FreeXid(xid); 1577 goto check_inval;
1424 return rc;
1425 }
1426 cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
1427 "jiffies %ld", full_path, direntry->d_inode,
1428 direntry->d_inode->i_count.counter, direntry,
1429 direntry->d_time, jiffies));
1430
1431 if (cifsInode->time == 0) {
1432 /* was set to zero previously to force revalidate */
1433 } else if (time_before(jiffies, cifsInode->time + HZ) &&
1434 lookupCacheEnabled) {
1435 if ((S_ISREG(direntry->d_inode->i_mode) == 0) ||
1436 (direntry->d_inode->i_nlink == 1)) {
1437 kfree(full_path);
1438 FreeXid(xid);
1439 return rc;
1440 } else {
1441 cFYI(1, ("Have to revalidate file due to hardlinks"));
1442 }
1443 }
1444
1445 /* save mtime and size */
1446 local_mtime = direntry->d_inode->i_mtime;
1447 local_size = direntry->d_inode->i_size;
1448
1449 if (cifs_sb->tcon->unix_ext) {
1450 rc = cifs_get_inode_info_unix(&direntry->d_inode, full_path,
1451 direntry->d_sb, xid);
1452 if (rc) {
1453 cFYI(1, ("error on getting revalidate info %d", rc));
1454/* if (rc != -ENOENT)
1455 rc = 0; */ /* BB should we cache info on
1456 certain errors? */
1457 }
1458 } else {
1459 rc = cifs_get_inode_info(&direntry->d_inode, full_path, NULL,
1460 direntry->d_sb, xid, NULL);
1461 if (rc) {
1462 cFYI(1, ("error on getting revalidate info %d", rc));
1463/* if (rc != -ENOENT)
1464 rc = 0; */ /* BB should we cache info on
1465 certain errors? */
1466 }
1467 } 1578 }
1468 /* should we remap certain errors, access denied?, to zero */
1469 1579
1470 /* if not oplocked, we invalidate inode pages if mtime or file size 1580 cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
1471 had changed on server */ 1581 "jiffies %ld", full_path, inode, inode->i_count.counter,
1582 dentry, dentry->d_time, jiffies));
1472 1583
1473 if (timespec_equal(&local_mtime, &direntry->d_inode->i_mtime) && 1584 if (CIFS_SB(sb)->tcon->unix_ext)
1474 (local_size == direntry->d_inode->i_size)) { 1585 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
1475 cFYI(1, ("cifs_revalidate - inode unchanged")); 1586 else
1476 } else { 1587 rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
1477 /* file may have changed on server */ 1588 xid, NULL);
1478 if (cifsInode->clientCanCacheRead) {
1479 /* no need to invalidate inode pages since we were the
1480 only ones who could have modified the file and the
1481 server copy is staler than ours */
1482 } else {
1483 invalidate_inode = true;
1484 }
1485 }
1486 1589
1487 /* can not grab this sem since kernel filesys locking documentation 1590check_inval:
1488 indicates i_mutex may be taken by the kernel on lookup and rename 1591 if (CIFS_I(inode)->invalid_mapping)
1489 which could deadlock if we grab the i_mutex here as well */ 1592 cifs_invalidate_mapping(inode);
1490/* mutex_lock(&direntry->d_inode->i_mutex);*/
1491 /* need to write out dirty pages here */
1492 if (direntry->d_inode->i_mapping) {
1493 /* do we need to lock inode until after invalidate completes
1494 below? */
1495 wbrc = filemap_fdatawrite(direntry->d_inode->i_mapping);
1496 if (wbrc)
1497 CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
1498 }
1499 if (invalidate_inode) {
1500 /* shrink_dcache not necessary now that cifs dentry ops
1501 are exported for negative dentries */
1502/* if (S_ISDIR(direntry->d_inode->i_mode))
1503 shrink_dcache_parent(direntry); */
1504 if (S_ISREG(direntry->d_inode->i_mode)) {
1505 if (direntry->d_inode->i_mapping) {
1506 wbrc = filemap_fdatawait(direntry->d_inode->i_mapping);
1507 if (wbrc)
1508 CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
1509 }
1510 /* may eventually have to do this for open files too */
1511 if (list_empty(&(cifsInode->openFileList))) {
1512 /* changed on server - flush read ahead pages */
1513 cFYI(1, ("Invalidating read ahead data on "
1514 "closed file"));
1515 invalidate_remote_inode(direntry->d_inode);
1516 }
1517 }
1518 }
1519/* mutex_unlock(&direntry->d_inode->i_mutex); */
1520 1593
1521 kfree(full_path); 1594 kfree(full_path);
1522 FreeXid(xid); 1595 FreeXid(xid);
@@ -1526,7 +1599,7 @@ int cifs_revalidate(struct dentry *direntry)
1526int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, 1599int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1527 struct kstat *stat) 1600 struct kstat *stat)
1528{ 1601{
1529 int err = cifs_revalidate(dentry); 1602 int err = cifs_revalidate_dentry(dentry);
1530 if (!err) { 1603 if (!err) {
1531 generic_fillattr(dentry->d_inode, stat); 1604 generic_fillattr(dentry->d_inode, stat);
1532 stat->blksize = CIFS_MAX_MSGSIZE; 1605 stat->blksize = CIFS_MAX_MSGSIZE;
@@ -1762,8 +1835,18 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1762 CIFS_MOUNT_MAP_SPECIAL_CHR); 1835 CIFS_MOUNT_MAP_SPECIAL_CHR);
1763 } 1836 }
1764 1837
1765 if (!rc) 1838 if (!rc) {
1766 rc = inode_setattr(inode, attrs); 1839 rc = inode_setattr(inode, attrs);
1840
1841 /* force revalidate when any of these times are set since some
1842 of the fs types (eg ext3, fat) do not have fine enough
1843 time granularity to match protocol, and we do not have a
1844 a way (yet) to query the server fs's time granularity (and
1845 whether it rounds times down).
1846 */
1847 if (!rc && (attrs->ia_valid & (ATTR_MTIME | ATTR_CTIME)))
1848 cifsInode->time = 0;
1849 }
1767out: 1850out:
1768 kfree(args); 1851 kfree(args);
1769 kfree(full_path); 1852 kfree(full_path);
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index fc1e0487eaee..c1a9d4236a8c 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -20,6 +20,7 @@
20 */ 20 */
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/stat.h> 22#include <linux/stat.h>
23#include <linux/slab.h>
23#include <linux/namei.h> 24#include <linux/namei.h>
24#include "cifsfs.h" 25#include "cifsfs.h"
25#include "cifspdu.h" 26#include "cifspdu.h"
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index d27d4ec6579b..d1474996a812 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -79,7 +79,7 @@ sesInfoAlloc(void)
79 ++ret_buf->ses_count; 79 ++ret_buf->ses_count;
80 INIT_LIST_HEAD(&ret_buf->smb_ses_list); 80 INIT_LIST_HEAD(&ret_buf->smb_ses_list);
81 INIT_LIST_HEAD(&ret_buf->tcon_list); 81 INIT_LIST_HEAD(&ret_buf->tcon_list);
82 init_MUTEX(&ret_buf->sesSem); 82 mutex_init(&ret_buf->session_mutex);
83 } 83 }
84 return ret_buf; 84 return ret_buf;
85} 85}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index f84062f9a985..18e0bc1fb593 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -22,6 +22,7 @@
22 */ 22 */
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/pagemap.h> 24#include <linux/pagemap.h>
25#include <linux/slab.h>
25#include <linux/stat.h> 26#include <linux/stat.h>
26#include "cifspdu.h" 27#include "cifspdu.h"
27#include "cifsglob.h" 28#include "cifsglob.h"
@@ -77,6 +78,11 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
77 78
78 cFYI(1, ("For %s", name->name)); 79 cFYI(1, ("For %s", name->name));
79 80
81 if (parent->d_op && parent->d_op->d_hash)
82 parent->d_op->d_hash(parent, name);
83 else
84 name->hash = full_name_hash(name->name, name->len);
85
80 dentry = d_lookup(parent, name); 86 dentry = d_lookup(parent, name);
81 if (dentry) { 87 if (dentry) {
82 /* FIXME: check for inode number changes? */ 88 /* FIXME: check for inode number changes? */
@@ -666,12 +672,11 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
666 min(len, max_len), nlt, 672 min(len, max_len), nlt,
667 cifs_sb->mnt_cifs_flags & 673 cifs_sb->mnt_cifs_flags &
668 CIFS_MOUNT_MAP_SPECIAL_CHR); 674 CIFS_MOUNT_MAP_SPECIAL_CHR);
675 pqst->len -= nls_nullsize(nlt);
669 } else { 676 } else {
670 pqst->name = filename; 677 pqst->name = filename;
671 pqst->len = len; 678 pqst->len = len;
672 } 679 }
673 pqst->hash = full_name_hash(pqst->name, pqst->len);
674/* cFYI(1, ("filldir on %s",pqst->name)); */
675 return rc; 680 return rc;
676} 681}
677 682
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 7085a6275c4c..7c3fd7463f44 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -29,6 +29,7 @@
29#include "ntlmssp.h" 29#include "ntlmssp.h"
30#include "nterr.h" 30#include "nterr.h"
31#include <linux/utsname.h> 31#include <linux/utsname.h>
32#include <linux/slab.h>
32#include "cifs_spnego.h" 33#include "cifs_spnego.h"
33 34
34extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, 35extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
@@ -223,9 +224,9 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
223 /* null user mount */ 224 /* null user mount */
224 *bcc_ptr = 0; 225 *bcc_ptr = 0;
225 *(bcc_ptr+1) = 0; 226 *(bcc_ptr+1) = 0;
226 } else { /* 300 should be long enough for any conceivable user name */ 227 } else {
227 bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->userName, 228 bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->userName,
228 300, nls_cp); 229 MAX_USERNAME_SIZE, nls_cp);
229 } 230 }
230 bcc_ptr += 2 * bytes_ret; 231 bcc_ptr += 2 * bytes_ret;
231 bcc_ptr += 2; /* account for null termination */ 232 bcc_ptr += 2; /* account for null termination */
@@ -246,11 +247,10 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
246 /* copy user */ 247 /* copy user */
247 if (ses->userName == NULL) { 248 if (ses->userName == NULL) {
248 /* BB what about null user mounts - check that we do this BB */ 249 /* BB what about null user mounts - check that we do this BB */
249 } else { /* 300 should be long enough for any conceivable user name */ 250 } else {
250 strncpy(bcc_ptr, ses->userName, 300); 251 strncpy(bcc_ptr, ses->userName, MAX_USERNAME_SIZE);
251 } 252 }
252 /* BB improve check for overflow */ 253 bcc_ptr += strnlen(ses->userName, MAX_USERNAME_SIZE);
253 bcc_ptr += strnlen(ses->userName, 300);
254 *bcc_ptr = 0; 254 *bcc_ptr = 0;
255 bcc_ptr++; /* account for null termination */ 255 bcc_ptr++; /* account for null termination */
256 256
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 93fb09a99c69..192ea51af20f 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -24,6 +24,7 @@
24*/ 24*/
25 25
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/slab.h>
27#include <linux/fs.h> 28#include <linux/fs.h>
28#include <linux/string.h> 29#include <linux/string.h>
29#include <linux/kernel.h> 30#include <linux/kernel.h>
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 07b8e71544ee..ad081fe7eb18 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/list.h> 24#include <linux/list.h>
25#include <linux/gfp.h>
25#include <linux/wait.h> 26#include <linux/wait.h>
26#include <linux/net.h> 27#include <linux/net.h>
27#include <linux/delay.h> 28#include <linux/delay.h>
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index a75afa3dd9e1..f555ce077d4f 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -21,6 +21,7 @@
21 21
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/posix_acl_xattr.h> 23#include <linux/posix_acl_xattr.h>
24#include <linux/slab.h>
24#include "cifsfs.h" 25#include "cifsfs.h"
25#include "cifspdu.h" 26#include "cifspdu.h"
26#include "cifsglob.h" 27#include "cifsglob.h"
@@ -244,7 +245,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
244 /* revalidate/getattr then populate from inode */ 245 /* revalidate/getattr then populate from inode */
245 } /* BB add else when above is implemented */ 246 } /* BB add else when above is implemented */
246 ea_name += 5; /* skip past user. prefix */ 247 ea_name += 5; /* skip past user. prefix */
247 rc = CIFSSMBQueryEA(xid, pTcon, full_path, ea_name, ea_value, 248 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
248 buf_size, cifs_sb->local_nls, 249 buf_size, cifs_sb->local_nls,
249 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 250 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
250 } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) { 251 } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) {
@@ -252,7 +253,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
252 goto get_ea_exit; 253 goto get_ea_exit;
253 254
254 ea_name += 4; /* skip past os2. prefix */ 255 ea_name += 4; /* skip past os2. prefix */
255 rc = CIFSSMBQueryEA(xid, pTcon, full_path, ea_name, ea_value, 256 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
256 buf_size, cifs_sb->local_nls, 257 buf_size, cifs_sb->local_nls,
257 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 258 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
258 } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, 259 } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
@@ -364,8 +365,8 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
364 /* if proc/fs/cifs/streamstoxattr is set then 365 /* if proc/fs/cifs/streamstoxattr is set then
365 search server for EAs or streams to 366 search server for EAs or streams to
366 returns as xattrs */ 367 returns as xattrs */
367 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, data, buf_size, 368 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, NULL, data,
368 cifs_sb->local_nls, 369 buf_size, cifs_sb->local_nls,
369 cifs_sb->mnt_cifs_flags & 370 cifs_sb->mnt_cifs_flags &
370 CIFS_MOUNT_MAP_SPECIAL_CHR); 371 CIFS_MOUNT_MAP_SPECIAL_CHR);
371 372
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 4bb9d0a5decc..ccd98b0f2b0b 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -12,6 +12,7 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/slab.h>
15#include <linux/file.h> 16#include <linux/file.h>
16#include <linux/stat.h> 17#include <linux/stat.h>
17#include <linux/errno.h> 18#include <linux/errno.h>
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ffd42815fda1..4c813f2cdc52 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -17,6 +17,7 @@
17#include <linux/errno.h> 17#include <linux/errno.h>
18#include <linux/smp_lock.h> 18#include <linux/smp_lock.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/slab.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21 22
22#include <linux/coda.h> 23#include <linux/coda.h>
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 830f51abb971..d97f9935a028 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -18,6 +18,7 @@
18#include <linux/smp_lock.h> 18#include <linux/smp_lock.h>
19#include <linux/file.h> 19#include <linux/file.h>
20#include <linux/vfs.h> 20#include <linux/vfs.h>
21#include <linux/slab.h>
21 22
22#include <asm/system.h> 23#include <asm/system.h>
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
@@ -166,6 +167,10 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
166 return -EBUSY; 167 return -EBUSY;
167 } 168 }
168 169
170 error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY);
171 if (error)
172 goto bdi_err;
173
169 vc->vc_sb = sb; 174 vc->vc_sb = sb;
170 175
171 sb->s_fs_info = vc; 176 sb->s_fs_info = vc;
@@ -174,6 +179,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
174 sb->s_blocksize_bits = 12; 179 sb->s_blocksize_bits = 12;
175 sb->s_magic = CODA_SUPER_MAGIC; 180 sb->s_magic = CODA_SUPER_MAGIC;
176 sb->s_op = &coda_super_operations; 181 sb->s_op = &coda_super_operations;
182 sb->s_bdi = &vc->bdi;
177 183
178 /* get root fid from Venus: this needs the root inode */ 184 /* get root fid from Venus: this needs the root inode */
179 error = venus_rootfid(sb, &fid); 185 error = venus_rootfid(sb, &fid);
@@ -199,6 +205,8 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
199 return 0; 205 return 0;
200 206
201 error: 207 error:
208 bdi_destroy(&vc->bdi);
209 bdi_err:
202 if (root) 210 if (root)
203 iput(root); 211 iput(root);
204 if (vc) 212 if (vc)
@@ -209,6 +217,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
209 217
210static void coda_put_super(struct super_block *sb) 218static void coda_put_super(struct super_block *sb)
211{ 219{
220 bdi_destroy(&coda_vcp(sb)->bdi);
212 coda_vcp(sb)->vc_sb = NULL; 221 coda_vcp(sb)->vc_sb = NULL;
213 sb->s_fs_info = NULL; 222 sb->s_fs_info = NULL;
214 223
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index c274d949179d..f09c5ed76f6c 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -26,6 +26,7 @@
26#include <linux/stat.h> 26#include <linux/stat.h>
27#include <linux/errno.h> 27#include <linux/errno.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/slab.h>
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30#include <linux/vmalloc.h> 31#include <linux/vmalloc.h>
31#include <linux/vfs.h> 32#include <linux/vfs.h>
diff --git a/fs/compat.c b/fs/compat.c
index 00d90c2e66f0..4b6ed03cc478 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -49,6 +49,7 @@
49#include <linux/mm.h> 49#include <linux/mm.h>
50#include <linux/eventpoll.h> 50#include <linux/eventpoll.h>
51#include <linux/fs_struct.h> 51#include <linux/fs_struct.h>
52#include <linux/slab.h>
52 53
53#include <asm/uaccess.h> 54#include <asm/uaccess.h>
54#include <asm/mmu_context.h> 55#include <asm/mmu_context.h>
@@ -1795,6 +1796,24 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
1795 return ret; 1796 return ret;
1796} 1797}
1797 1798
1799struct compat_sel_arg_struct {
1800 compat_ulong_t n;
1801 compat_uptr_t inp;
1802 compat_uptr_t outp;
1803 compat_uptr_t exp;
1804 compat_uptr_t tvp;
1805};
1806
1807asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg)
1808{
1809 struct compat_sel_arg_struct a;
1810
1811 if (copy_from_user(&a, arg, sizeof(a)))
1812 return -EFAULT;
1813 return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
1814 compat_ptr(a.exp), compat_ptr(a.tvp));
1815}
1816
1798#ifdef HAVE_SET_RESTORE_SIGMASK 1817#ifdef HAVE_SET_RESTORE_SIGMASK
1799static long do_compat_pselect(int n, compat_ulong_t __user *inp, 1818static long do_compat_pselect(int n, compat_ulong_t __user *inp,
1800 compat_ulong_t __user *outp, compat_ulong_t __user *exp, 1819 compat_ulong_t __user *outp, compat_ulong_t __user *exp,
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index 0adced2f296f..112e45a17e99 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -28,10 +28,12 @@
28 28
29#undef elfhdr 29#undef elfhdr
30#undef elf_phdr 30#undef elf_phdr
31#undef elf_shdr
31#undef elf_note 32#undef elf_note
32#undef elf_addr_t 33#undef elf_addr_t
33#define elfhdr elf32_hdr 34#define elfhdr elf32_hdr
34#define elf_phdr elf32_phdr 35#define elf_phdr elf32_phdr
36#define elf_shdr elf32_shdr
35#define elf_note elf32_note 37#define elf_note elf32_note
36#define elf_addr_t Elf32_Addr 38#define elf_addr_t Elf32_Addr
37 39
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 332dd00f0894..c32a1b6a856b 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -23,7 +23,6 @@
23#include <linux/ioctl.h> 23#include <linux/ioctl.h>
24#include <linux/if.h> 24#include <linux/if.h>
25#include <linux/if_bridge.h> 25#include <linux/if_bridge.h>
26#include <linux/slab.h>
27#include <linux/raid/md_u.h> 26#include <linux/raid/md_u.h>
28#include <linux/kd.h> 27#include <linux/kd.h>
29#include <linux/route.h> 28#include <linux/route.h>
@@ -60,6 +59,7 @@
60#include <linux/i2c.h> 59#include <linux/i2c.h>
61#include <linux/i2c-dev.h> 60#include <linux/i2c-dev.h>
62#include <linux/atalk.h> 61#include <linux/atalk.h>
62#include <linux/gfp.h>
63 63
64#include <net/bluetooth/bluetooth.h> 64#include <net/bluetooth/bluetooth.h>
65#include <net/bluetooth/hci.h> 65#include <net/bluetooth/hci.h>
@@ -301,6 +301,12 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd,
301 u32 data; 301 u32 data;
302 void __user *dxferp; 302 void __user *dxferp;
303 int err; 303 int err;
304 int interface_id;
305
306 if (get_user(interface_id, &sgio32->interface_id))
307 return -EFAULT;
308 if (interface_id != 'S')
309 return sys_ioctl(fd, cmd, (unsigned long)sgio32);
304 310
305 if (get_user(iovec_count, &sgio32->iovec_count)) 311 if (get_user(iovec_count, &sgio32->iovec_count))
306 return -EFAULT; 312 return -EFAULT;
@@ -539,7 +545,7 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
539 kcmd = MTIOCPOS; 545 kcmd = MTIOCPOS;
540 karg = &pos; 546 karg = &pos;
541 break; 547 break;
542 case MTIOCGET32: 548 default: /* MTIOCGET32 */
543 kcmd = MTIOCGET; 549 kcmd = MTIOCGET;
544 karg = &get; 550 karg = &get;
545 break; 551 break;
@@ -657,7 +663,7 @@ static int raw_ioctl(unsigned fd, unsigned cmd,
657 663
658 switch (cmd) { 664 switch (cmd) {
659 case RAW_SETBIND: 665 case RAW_SETBIND:
660 case RAW_GETBIND: { 666 default: { /* RAW_GETBIND */
661 struct raw_config_request req; 667 struct raw_config_request req;
662 mm_segment_t oldfs = get_fs(); 668 mm_segment_t oldfs = get_fs();
663 669
@@ -936,6 +942,7 @@ COMPATIBLE_IOCTL(TCSETSF)
936COMPATIBLE_IOCTL(TIOCLINUX) 942COMPATIBLE_IOCTL(TIOCLINUX)
937COMPATIBLE_IOCTL(TIOCSBRK) 943COMPATIBLE_IOCTL(TIOCSBRK)
938COMPATIBLE_IOCTL(TIOCCBRK) 944COMPATIBLE_IOCTL(TIOCCBRK)
945COMPATIBLE_IOCTL(TIOCGSID)
939COMPATIBLE_IOCTL(TIOCGICOUNT) 946COMPATIBLE_IOCTL(TIOCGICOUNT)
940/* Little t */ 947/* Little t */
941COMPATIBLE_IOCTL(TIOCGETD) 948COMPATIBLE_IOCTL(TIOCGETD)
@@ -1005,6 +1012,9 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND)
1005COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) 1012COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST)
1006COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) 1013COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI)
1007#endif 1014#endif
1015/* Big V (don't complain on serial console) */
1016IGNORE_IOCTL(VT_OPENQRY)
1017IGNORE_IOCTL(VT_GETMODE)
1008/* Little p (/dev/rtc, /dev/envctrl, etc.) */ 1018/* Little p (/dev/rtc, /dev/envctrl, etc.) */
1009COMPATIBLE_IOCTL(RTC_AIE_ON) 1019COMPATIBLE_IOCTL(RTC_AIE_ON)
1010COMPATIBLE_IOCTL(RTC_AIE_OFF) 1020COMPATIBLE_IOCTL(RTC_AIE_OFF)
@@ -1035,6 +1045,8 @@ COMPATIBLE_IOCTL(FIOQSIZE)
1035#ifdef CONFIG_BLOCK 1045#ifdef CONFIG_BLOCK
1036/* loop */ 1046/* loop */
1037IGNORE_IOCTL(LOOP_CLR_FD) 1047IGNORE_IOCTL(LOOP_CLR_FD)
1048/* md calls this on random blockdevs */
1049IGNORE_IOCTL(RAID_VERSION)
1038/* SG stuff */ 1050/* SG stuff */
1039COMPATIBLE_IOCTL(SG_SET_TIMEOUT) 1051COMPATIBLE_IOCTL(SG_SET_TIMEOUT)
1040COMPATIBLE_IOCTL(SG_GET_TIMEOUT) 1052COMPATIBLE_IOCTL(SG_GET_TIMEOUT)
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index a2f746066c5d..c8af2d91174b 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -34,6 +34,7 @@
34#include <linux/capability.h> 34#include <linux/capability.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/lockdep.h> 36#include <linux/lockdep.h>
37#include <linux/slab.h>
37 38
38#include <linux/configfs.h> 39#include <linux/configfs.h>
39#include "configfs_internal.h" 40#include "configfs_internal.h"
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 8421cea7d8c7..8c8d64230c2d 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -29,6 +29,7 @@
29#include <linux/mount.h> 29#include <linux/mount.h>
30#include <linux/pagemap.h> 30#include <linux/pagemap.h>
31#include <linux/init.h> 31#include <linux/init.h>
32#include <linux/slab.h>
32 33
33#include <linux/configfs.h> 34#include <linux/configfs.h>
34#include "configfs_internal.h" 35#include "configfs_internal.h"
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index c8afa6b1d91d..0f3eb41d9201 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -27,6 +27,7 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/namei.h> 29#include <linux/namei.h>
30#include <linux/slab.h>
30 31
31#include <linux/configfs.h> 32#include <linux/configfs.h>
32#include "configfs_internal.h" 33#include "configfs_internal.h"
@@ -121,8 +122,10 @@ static int get_target(const char *symname, struct path *path,
121 ret = -ENOENT; 122 ret = -ENOENT;
122 path_put(path); 123 path_put(path);
123 } 124 }
124 } else 125 } else {
125 ret = -EPERM; 126 ret = -EPERM;
127 path_put(path);
128 }
126 } 129 }
127 130
128 return ret; 131 return ret;
diff --git a/fs/dcache.c b/fs/dcache.c
index 953173a293a9..f1358e5c3a59 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -257,6 +257,7 @@ kill_it:
257 if (dentry) 257 if (dentry)
258 goto repeat; 258 goto repeat;
259} 259}
260EXPORT_SYMBOL(dput);
260 261
261/** 262/**
262 * d_invalidate - invalidate a dentry 263 * d_invalidate - invalidate a dentry
@@ -314,6 +315,7 @@ int d_invalidate(struct dentry * dentry)
314 spin_unlock(&dcache_lock); 315 spin_unlock(&dcache_lock);
315 return 0; 316 return 0;
316} 317}
318EXPORT_SYMBOL(d_invalidate);
317 319
318/* This should be called _only_ with dcache_lock held */ 320/* This should be called _only_ with dcache_lock held */
319 321
@@ -328,6 +330,7 @@ struct dentry * dget_locked(struct dentry *dentry)
328{ 330{
329 return __dget_locked(dentry); 331 return __dget_locked(dentry);
330} 332}
333EXPORT_SYMBOL(dget_locked);
331 334
332/** 335/**
333 * d_find_alias - grab a hashed alias of inode 336 * d_find_alias - grab a hashed alias of inode
@@ -384,6 +387,7 @@ struct dentry * d_find_alias(struct inode *inode)
384 } 387 }
385 return de; 388 return de;
386} 389}
390EXPORT_SYMBOL(d_find_alias);
387 391
388/* 392/*
389 * Try to kill dentries associated with this inode. 393 * Try to kill dentries associated with this inode.
@@ -408,6 +412,7 @@ restart:
408 } 412 }
409 spin_unlock(&dcache_lock); 413 spin_unlock(&dcache_lock);
410} 414}
415EXPORT_SYMBOL(d_prune_aliases);
411 416
412/* 417/*
413 * Throw away a dentry - free the inode, dput the parent. This requires that 418 * Throw away a dentry - free the inode, dput the parent. This requires that
@@ -610,6 +615,7 @@ void shrink_dcache_sb(struct super_block * sb)
610{ 615{
611 __shrink_dcache_sb(sb, NULL, 0); 616 __shrink_dcache_sb(sb, NULL, 0);
612} 617}
618EXPORT_SYMBOL(shrink_dcache_sb);
613 619
614/* 620/*
615 * destroy a single subtree of dentries for unmount 621 * destroy a single subtree of dentries for unmount
@@ -792,6 +798,7 @@ positive:
792 spin_unlock(&dcache_lock); 798 spin_unlock(&dcache_lock);
793 return 1; 799 return 1;
794} 800}
801EXPORT_SYMBOL(have_submounts);
795 802
796/* 803/*
797 * Search the dentry child list for the specified parent, 804 * Search the dentry child list for the specified parent,
@@ -876,6 +883,7 @@ void shrink_dcache_parent(struct dentry * parent)
876 while ((found = select_parent(parent)) != 0) 883 while ((found = select_parent(parent)) != 0)
877 __shrink_dcache_sb(sb, &found, 0); 884 __shrink_dcache_sb(sb, &found, 0);
878} 885}
886EXPORT_SYMBOL(shrink_dcache_parent);
879 887
880/* 888/*
881 * Scan `nr' dentries and return the number which remain. 889 * Scan `nr' dentries and return the number which remain.
@@ -968,6 +976,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
968 976
969 return dentry; 977 return dentry;
970} 978}
979EXPORT_SYMBOL(d_alloc);
971 980
972struct dentry *d_alloc_name(struct dentry *parent, const char *name) 981struct dentry *d_alloc_name(struct dentry *parent, const char *name)
973{ 982{
@@ -1012,6 +1021,7 @@ void d_instantiate(struct dentry *entry, struct inode * inode)
1012 spin_unlock(&dcache_lock); 1021 spin_unlock(&dcache_lock);
1013 security_d_instantiate(entry, inode); 1022 security_d_instantiate(entry, inode);
1014} 1023}
1024EXPORT_SYMBOL(d_instantiate);
1015 1025
1016/** 1026/**
1017 * d_instantiate_unique - instantiate a non-aliased dentry 1027 * d_instantiate_unique - instantiate a non-aliased dentry
@@ -1108,6 +1118,7 @@ struct dentry * d_alloc_root(struct inode * root_inode)
1108 } 1118 }
1109 return res; 1119 return res;
1110} 1120}
1121EXPORT_SYMBOL(d_alloc_root);
1111 1122
1112static inline struct hlist_head *d_hash(struct dentry *parent, 1123static inline struct hlist_head *d_hash(struct dentry *parent,
1113 unsigned long hash) 1124 unsigned long hash)
@@ -1211,7 +1222,6 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
1211 BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); 1222 BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
1212 spin_unlock(&dcache_lock); 1223 spin_unlock(&dcache_lock);
1213 security_d_instantiate(new, inode); 1224 security_d_instantiate(new, inode);
1214 d_rehash(dentry);
1215 d_move(new, dentry); 1225 d_move(new, dentry);
1216 iput(inode); 1226 iput(inode);
1217 } else { 1227 } else {
@@ -1225,6 +1235,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
1225 d_add(dentry, inode); 1235 d_add(dentry, inode);
1226 return new; 1236 return new;
1227} 1237}
1238EXPORT_SYMBOL(d_splice_alias);
1228 1239
1229/** 1240/**
1230 * d_add_ci - lookup or allocate new dentry with case-exact name 1241 * d_add_ci - lookup or allocate new dentry with case-exact name
@@ -1314,6 +1325,7 @@ err_out:
1314 iput(inode); 1325 iput(inode);
1315 return ERR_PTR(error); 1326 return ERR_PTR(error);
1316} 1327}
1328EXPORT_SYMBOL(d_add_ci);
1317 1329
1318/** 1330/**
1319 * d_lookup - search for a dentry 1331 * d_lookup - search for a dentry
@@ -1357,6 +1369,7 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
1357 } while (read_seqretry(&rename_lock, seq)); 1369 } while (read_seqretry(&rename_lock, seq));
1358 return dentry; 1370 return dentry;
1359} 1371}
1372EXPORT_SYMBOL(d_lookup);
1360 1373
1361struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) 1374struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1362{ 1375{
@@ -1483,6 +1496,7 @@ int d_validate(struct dentry *dentry, struct dentry *dparent)
1483out: 1496out:
1484 return 0; 1497 return 0;
1485} 1498}
1499EXPORT_SYMBOL(d_validate);
1486 1500
1487/* 1501/*
1488 * When a file is deleted, we have two options: 1502 * When a file is deleted, we have two options:
@@ -1528,6 +1542,7 @@ void d_delete(struct dentry * dentry)
1528 1542
1529 fsnotify_nameremove(dentry, isdir); 1543 fsnotify_nameremove(dentry, isdir);
1530} 1544}
1545EXPORT_SYMBOL(d_delete);
1531 1546
1532static void __d_rehash(struct dentry * entry, struct hlist_head *list) 1547static void __d_rehash(struct dentry * entry, struct hlist_head *list)
1533{ 1548{
@@ -1556,6 +1571,7 @@ void d_rehash(struct dentry * entry)
1556 spin_unlock(&entry->d_lock); 1571 spin_unlock(&entry->d_lock);
1557 spin_unlock(&dcache_lock); 1572 spin_unlock(&dcache_lock);
1558} 1573}
1574EXPORT_SYMBOL(d_rehash);
1559 1575
1560/* 1576/*
1561 * When switching names, the actual string doesn't strictly have to 1577 * When switching names, the actual string doesn't strictly have to
@@ -1702,6 +1718,7 @@ void d_move(struct dentry * dentry, struct dentry * target)
1702 d_move_locked(dentry, target); 1718 d_move_locked(dentry, target);
1703 spin_unlock(&dcache_lock); 1719 spin_unlock(&dcache_lock);
1704} 1720}
1721EXPORT_SYMBOL(d_move);
1705 1722
1706/** 1723/**
1707 * d_ancestor - search for an ancestor 1724 * d_ancestor - search for an ancestor
@@ -1868,6 +1885,7 @@ shouldnt_be_hashed:
1868 spin_unlock(&dcache_lock); 1885 spin_unlock(&dcache_lock);
1869 BUG(); 1886 BUG();
1870} 1887}
1888EXPORT_SYMBOL_GPL(d_materialise_unique);
1871 1889
1872static int prepend(char **buffer, int *buflen, const char *str, int namelen) 1890static int prepend(char **buffer, int *buflen, const char *str, int namelen)
1873{ 1891{
@@ -2005,6 +2023,7 @@ char *d_path(const struct path *path, char *buf, int buflen)
2005 path_put(&root); 2023 path_put(&root);
2006 return res; 2024 return res;
2007} 2025}
2026EXPORT_SYMBOL(d_path);
2008 2027
2009/* 2028/*
2010 * Helper function for dentry_operations.d_dname() members 2029 * Helper function for dentry_operations.d_dname() members
@@ -2171,6 +2190,30 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
2171 return result; 2190 return result;
2172} 2191}
2173 2192
2193int path_is_under(struct path *path1, struct path *path2)
2194{
2195 struct vfsmount *mnt = path1->mnt;
2196 struct dentry *dentry = path1->dentry;
2197 int res;
2198 spin_lock(&vfsmount_lock);
2199 if (mnt != path2->mnt) {
2200 for (;;) {
2201 if (mnt->mnt_parent == mnt) {
2202 spin_unlock(&vfsmount_lock);
2203 return 0;
2204 }
2205 if (mnt->mnt_parent == path2->mnt)
2206 break;
2207 mnt = mnt->mnt_parent;
2208 }
2209 dentry = mnt->mnt_mountpoint;
2210 }
2211 res = is_subdir(dentry, path2->dentry);
2212 spin_unlock(&vfsmount_lock);
2213 return res;
2214}
2215EXPORT_SYMBOL(path_is_under);
2216
2174void d_genocide(struct dentry *root) 2217void d_genocide(struct dentry *root)
2175{ 2218{
2176 struct dentry *this_parent = root; 2219 struct dentry *this_parent = root;
@@ -2228,6 +2271,7 @@ ino_t find_inode_number(struct dentry *dir, struct qstr *name)
2228 } 2271 }
2229 return ino; 2272 return ino;
2230} 2273}
2274EXPORT_SYMBOL(find_inode_number);
2231 2275
2232static __initdata unsigned long dhash_entries; 2276static __initdata unsigned long dhash_entries;
2233static int __init set_dhash_entries(char *str) 2277static int __init set_dhash_entries(char *str)
@@ -2297,6 +2341,7 @@ static void __init dcache_init(void)
2297 2341
2298/* SLAB cache for __getname() consumers */ 2342/* SLAB cache for __getname() consumers */
2299struct kmem_cache *names_cachep __read_mostly; 2343struct kmem_cache *names_cachep __read_mostly;
2344EXPORT_SYMBOL(names_cachep);
2300 2345
2301EXPORT_SYMBOL(d_genocide); 2346EXPORT_SYMBOL(d_genocide);
2302 2347
@@ -2326,26 +2371,3 @@ void __init vfs_caches_init(unsigned long mempages)
2326 bdev_cache_init(); 2371 bdev_cache_init();
2327 chrdev_init(); 2372 chrdev_init();
2328} 2373}
2329
2330EXPORT_SYMBOL(d_alloc);
2331EXPORT_SYMBOL(d_alloc_root);
2332EXPORT_SYMBOL(d_delete);
2333EXPORT_SYMBOL(d_find_alias);
2334EXPORT_SYMBOL(d_instantiate);
2335EXPORT_SYMBOL(d_invalidate);
2336EXPORT_SYMBOL(d_lookup);
2337EXPORT_SYMBOL(d_move);
2338EXPORT_SYMBOL_GPL(d_materialise_unique);
2339EXPORT_SYMBOL(d_path);
2340EXPORT_SYMBOL(d_prune_aliases);
2341EXPORT_SYMBOL(d_rehash);
2342EXPORT_SYMBOL(d_splice_alias);
2343EXPORT_SYMBOL(d_add_ci);
2344EXPORT_SYMBOL(d_validate);
2345EXPORT_SYMBOL(dget_locked);
2346EXPORT_SYMBOL(dput);
2347EXPORT_SYMBOL(find_inode_number);
2348EXPORT_SYMBOL(have_submounts);
2349EXPORT_SYMBOL(names_cachep);
2350EXPORT_SYMBOL(shrink_dcache_parent);
2351EXPORT_SYMBOL(shrink_dcache_sb);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index b486169f42bf..30a87b3dbcac 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -27,6 +27,7 @@
27#include <linux/fsnotify.h> 27#include <linux/fsnotify.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/magic.h> 29#include <linux/magic.h>
30#include <linux/slab.h>
30 31
31static struct vfsmount *debugfs_mount; 32static struct vfsmount *debugfs_mount;
32static int debugfs_mount_count; 33static int debugfs_mount_count;
@@ -160,15 +161,8 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
160 * block. A pointer to that is in the struct vfsmount that we 161 * block. A pointer to that is in the struct vfsmount that we
161 * have around. 162 * have around.
162 */ 163 */
163 if (!parent) { 164 if (!parent)
164 if (debugfs_mount && debugfs_mount->mnt_sb) { 165 parent = debugfs_mount->mnt_sb->s_root;
165 parent = debugfs_mount->mnt_sb->s_root;
166 }
167 }
168 if (!parent) {
169 pr_debug("debugfs: Ah! can not find a parent!\n");
170 return -EFAULT;
171 }
172 166
173 *dentry = NULL; 167 *dentry = NULL;
174 mutex_lock(&parent->d_inode->i_mutex); 168 mutex_lock(&parent->d_inode->i_mutex);
@@ -503,7 +497,7 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
503 } 497 }
504 d_move(old_dentry, dentry); 498 d_move(old_dentry, dentry);
505 fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name, 499 fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name,
506 old_dentry->d_name.name, S_ISDIR(old_dentry->d_inode->i_mode), 500 S_ISDIR(old_dentry->d_inode->i_mode),
507 NULL, old_dentry); 501 NULL, old_dentry);
508 fsnotify_oldname_free(old_name); 502 fsnotify_oldname_free(old_name);
509 unlock_rename(new_dir, old_dir); 503 unlock_rename(new_dir, old_dir);
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 8882ecc0f1bf..0120247b41c0 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -15,6 +15,7 @@
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/slab.h>
18#include <linux/mount.h> 19#include <linux/mount.h>
19#include <linux/tty.h> 20#include <linux/tty.h>
20#include <linux/mutex.h> 21#include <linux/mutex.h>
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index dc2ad6008b2d..4314f0d48d85 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -33,10 +33,10 @@ void dlm_del_ast(struct dlm_lkb *lkb)
33 spin_unlock(&ast_queue_lock); 33 spin_unlock(&ast_queue_lock);
34} 34}
35 35
36void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode) 36void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode)
37{ 37{
38 if (lkb->lkb_flags & DLM_IFL_USER) { 38 if (lkb->lkb_flags & DLM_IFL_USER) {
39 dlm_user_add_ast(lkb, type, bastmode); 39 dlm_user_add_ast(lkb, type, mode);
40 return; 40 return;
41 } 41 }
42 42
@@ -44,10 +44,21 @@ void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
44 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) { 44 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
45 kref_get(&lkb->lkb_ref); 45 kref_get(&lkb->lkb_ref);
46 list_add_tail(&lkb->lkb_astqueue, &ast_queue); 46 list_add_tail(&lkb->lkb_astqueue, &ast_queue);
47 lkb->lkb_ast_first = type;
47 } 48 }
49
50 /* sanity check, this should not happen */
51
52 if ((type == AST_COMP) && (lkb->lkb_ast_type & AST_COMP))
53 log_print("repeat cast %d castmode %d lock %x %s",
54 mode, lkb->lkb_castmode,
55 lkb->lkb_id, lkb->lkb_resource->res_name);
56
48 lkb->lkb_ast_type |= type; 57 lkb->lkb_ast_type |= type;
49 if (bastmode) 58 if (type == AST_BAST)
50 lkb->lkb_bastmode = bastmode; 59 lkb->lkb_bastmode = mode;
60 else
61 lkb->lkb_castmode = mode;
51 spin_unlock(&ast_queue_lock); 62 spin_unlock(&ast_queue_lock);
52 63
53 set_bit(WAKE_ASTS, &astd_wakeflags); 64 set_bit(WAKE_ASTS, &astd_wakeflags);
@@ -59,9 +70,9 @@ static void process_asts(void)
59 struct dlm_ls *ls = NULL; 70 struct dlm_ls *ls = NULL;
60 struct dlm_rsb *r = NULL; 71 struct dlm_rsb *r = NULL;
61 struct dlm_lkb *lkb; 72 struct dlm_lkb *lkb;
62 void (*cast) (void *astparam); 73 void (*castfn) (void *astparam);
63 void (*bast) (void *astparam, int mode); 74 void (*bastfn) (void *astparam, int mode);
64 int type = 0, bastmode; 75 int type, first, bastmode, castmode, do_bast, do_cast, last_castmode;
65 76
66repeat: 77repeat:
67 spin_lock(&ast_queue_lock); 78 spin_lock(&ast_queue_lock);
@@ -75,17 +86,48 @@ repeat:
75 list_del(&lkb->lkb_astqueue); 86 list_del(&lkb->lkb_astqueue);
76 type = lkb->lkb_ast_type; 87 type = lkb->lkb_ast_type;
77 lkb->lkb_ast_type = 0; 88 lkb->lkb_ast_type = 0;
89 first = lkb->lkb_ast_first;
90 lkb->lkb_ast_first = 0;
78 bastmode = lkb->lkb_bastmode; 91 bastmode = lkb->lkb_bastmode;
79 92 castmode = lkb->lkb_castmode;
93 castfn = lkb->lkb_astfn;
94 bastfn = lkb->lkb_bastfn;
80 spin_unlock(&ast_queue_lock); 95 spin_unlock(&ast_queue_lock);
81 cast = lkb->lkb_astfn;
82 bast = lkb->lkb_bastfn;
83
84 if ((type & AST_COMP) && cast)
85 cast(lkb->lkb_astparam);
86 96
87 if ((type & AST_BAST) && bast) 97 do_cast = (type & AST_COMP) && castfn;
88 bast(lkb->lkb_astparam, bastmode); 98 do_bast = (type & AST_BAST) && bastfn;
99
100 /* Skip a bast if its blocking mode is compatible with the
101 granted mode of the preceding cast. */
102
103 if (do_bast) {
104 if (first == AST_COMP)
105 last_castmode = castmode;
106 else
107 last_castmode = lkb->lkb_castmode_done;
108 if (dlm_modes_compat(bastmode, last_castmode))
109 do_bast = 0;
110 }
111
112 if (first == AST_COMP) {
113 if (do_cast)
114 castfn(lkb->lkb_astparam);
115 if (do_bast)
116 bastfn(lkb->lkb_astparam, bastmode);
117 } else if (first == AST_BAST) {
118 if (do_bast)
119 bastfn(lkb->lkb_astparam, bastmode);
120 if (do_cast)
121 castfn(lkb->lkb_astparam);
122 } else {
123 log_error(ls, "bad ast_first %d ast_type %d",
124 first, type);
125 }
126
127 if (do_cast)
128 lkb->lkb_castmode_done = castmode;
129 if (do_bast)
130 lkb->lkb_bastmode_done = bastmode;
89 131
90 /* this removes the reference added by dlm_add_ast 132 /* this removes the reference added by dlm_add_ast
91 and may result in the lkb being freed */ 133 and may result in the lkb being freed */
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index 1b5fc5f428fd..bcb1aaba519d 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -13,7 +13,7 @@
13#ifndef __ASTD_DOT_H__ 13#ifndef __ASTD_DOT_H__
14#define __ASTD_DOT_H__ 14#define __ASTD_DOT_H__
15 15
16void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode); 16void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode);
17void dlm_del_ast(struct dlm_lkb *lkb); 17void dlm_del_ast(struct dlm_lkb *lkb);
18 18
19void dlm_astd_wake(void); 19void dlm_astd_wake(void);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 0df243850818..b54bca03d92f 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -14,6 +14,7 @@
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/configfs.h> 16#include <linux/configfs.h>
17#include <linux/slab.h>
17#include <linux/in.h> 18#include <linux/in.h>
18#include <linux/in6.h> 19#include <linux/in6.h>
19#include <net/ipv6.h> 20#include <net/ipv6.h>
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 375a2359b3bf..c6cf25158746 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -15,6 +15,7 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/debugfs.h> 17#include <linux/debugfs.h>
18#include <linux/slab.h>
18 19
19#include "dlm_internal.h" 20#include "dlm_internal.h"
20#include "lock.h" 21#include "lock.h"
@@ -256,7 +257,7 @@ static int print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
256 lkb->lkb_status, 257 lkb->lkb_status,
257 lkb->lkb_grmode, 258 lkb->lkb_grmode,
258 lkb->lkb_rqmode, 259 lkb->lkb_rqmode,
259 lkb->lkb_highbast, 260 lkb->lkb_bastmode,
260 rsb_lookup, 261 rsb_lookup,
261 lkb->lkb_wait_type, 262 lkb->lkb_wait_type,
262 lkb->lkb_lvbseq, 263 lkb->lkb_lvbseq,
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 826d3dc6e0ab..f632b58cd222 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -232,11 +232,17 @@ struct dlm_lkb {
232 int8_t lkb_status; /* granted, waiting, convert */ 232 int8_t lkb_status; /* granted, waiting, convert */
233 int8_t lkb_rqmode; /* requested lock mode */ 233 int8_t lkb_rqmode; /* requested lock mode */
234 int8_t lkb_grmode; /* granted lock mode */ 234 int8_t lkb_grmode; /* granted lock mode */
235 int8_t lkb_bastmode; /* requested mode */
236 int8_t lkb_highbast; /* highest mode bast sent for */ 235 int8_t lkb_highbast; /* highest mode bast sent for */
236
237 int8_t lkb_wait_type; /* type of reply waiting for */ 237 int8_t lkb_wait_type; /* type of reply waiting for */
238 int8_t lkb_wait_count; 238 int8_t lkb_wait_count;
239 int8_t lkb_ast_type; /* type of ast queued for */ 239 int8_t lkb_ast_type; /* type of ast queued for */
240 int8_t lkb_ast_first; /* type of first ast queued */
241
242 int8_t lkb_bastmode; /* req mode of queued bast */
243 int8_t lkb_castmode; /* gr mode of queued cast */
244 int8_t lkb_bastmode_done; /* last delivered bastmode */
245 int8_t lkb_castmode_done; /* last delivered castmode */
240 246
241 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */ 247 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
242 struct list_head lkb_statequeue; /* rsb g/c/w list */ 248 struct list_head lkb_statequeue; /* rsb g/c/w list */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 9c0c1db1e105..17903b491298 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -56,6 +56,7 @@
56 L: receive_xxxx_reply() <- R: send_xxxx_reply() 56 L: receive_xxxx_reply() <- R: send_xxxx_reply()
57*/ 57*/
58#include <linux/types.h> 58#include <linux/types.h>
59#include <linux/slab.h>
59#include "dlm_internal.h" 60#include "dlm_internal.h"
60#include <linux/dlm_device.h> 61#include <linux/dlm_device.h>
61#include "memory.h" 62#include "memory.h"
@@ -307,7 +308,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
307 lkb->lkb_lksb->sb_status = rv; 308 lkb->lkb_lksb->sb_status = rv;
308 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags; 309 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
309 310
310 dlm_add_ast(lkb, AST_COMP, 0); 311 dlm_add_ast(lkb, AST_COMP, lkb->lkb_grmode);
311} 312}
312 313
313static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) 314static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -320,10 +321,12 @@ static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
320{ 321{
321 lkb->lkb_time_bast = ktime_get(); 322 lkb->lkb_time_bast = ktime_get();
322 323
323 if (is_master_copy(lkb)) 324 if (is_master_copy(lkb)) {
325 lkb->lkb_bastmode = rqmode; /* printed by debugfs */
324 send_bast(r, lkb, rqmode); 326 send_bast(r, lkb, rqmode);
325 else 327 } else {
326 dlm_add_ast(lkb, AST_BAST, rqmode); 328 dlm_add_ast(lkb, AST_BAST, rqmode);
329 }
327} 330}
328 331
329/* 332/*
@@ -2280,20 +2283,30 @@ static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
2280 if (can_be_queued(lkb)) { 2283 if (can_be_queued(lkb)) {
2281 error = -EINPROGRESS; 2284 error = -EINPROGRESS;
2282 add_lkb(r, lkb, DLM_LKSTS_WAITING); 2285 add_lkb(r, lkb, DLM_LKSTS_WAITING);
2283 send_blocking_asts(r, lkb);
2284 add_timeout(lkb); 2286 add_timeout(lkb);
2285 goto out; 2287 goto out;
2286 } 2288 }
2287 2289
2288 error = -EAGAIN; 2290 error = -EAGAIN;
2289 if (force_blocking_asts(lkb))
2290 send_blocking_asts_all(r, lkb);
2291 queue_cast(r, lkb, -EAGAIN); 2291 queue_cast(r, lkb, -EAGAIN);
2292
2293 out: 2292 out:
2294 return error; 2293 return error;
2295} 2294}
2296 2295
2296static void do_request_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
2297 int error)
2298{
2299 switch (error) {
2300 case -EAGAIN:
2301 if (force_blocking_asts(lkb))
2302 send_blocking_asts_all(r, lkb);
2303 break;
2304 case -EINPROGRESS:
2305 send_blocking_asts(r, lkb);
2306 break;
2307 }
2308}
2309
2297static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) 2310static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2298{ 2311{
2299 int error = 0; 2312 int error = 0;
@@ -2304,7 +2317,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2304 if (can_be_granted(r, lkb, 1, &deadlk)) { 2317 if (can_be_granted(r, lkb, 1, &deadlk)) {
2305 grant_lock(r, lkb); 2318 grant_lock(r, lkb);
2306 queue_cast(r, lkb, 0); 2319 queue_cast(r, lkb, 0);
2307 grant_pending_locks(r);
2308 goto out; 2320 goto out;
2309 } 2321 }
2310 2322
@@ -2334,7 +2346,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2334 if (_can_be_granted(r, lkb, 1)) { 2346 if (_can_be_granted(r, lkb, 1)) {
2335 grant_lock(r, lkb); 2347 grant_lock(r, lkb);
2336 queue_cast(r, lkb, 0); 2348 queue_cast(r, lkb, 0);
2337 grant_pending_locks(r);
2338 goto out; 2349 goto out;
2339 } 2350 }
2340 /* else fall through and move to convert queue */ 2351 /* else fall through and move to convert queue */
@@ -2344,28 +2355,47 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2344 error = -EINPROGRESS; 2355 error = -EINPROGRESS;
2345 del_lkb(r, lkb); 2356 del_lkb(r, lkb);
2346 add_lkb(r, lkb, DLM_LKSTS_CONVERT); 2357 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
2347 send_blocking_asts(r, lkb);
2348 add_timeout(lkb); 2358 add_timeout(lkb);
2349 goto out; 2359 goto out;
2350 } 2360 }
2351 2361
2352 error = -EAGAIN; 2362 error = -EAGAIN;
2353 if (force_blocking_asts(lkb))
2354 send_blocking_asts_all(r, lkb);
2355 queue_cast(r, lkb, -EAGAIN); 2363 queue_cast(r, lkb, -EAGAIN);
2356
2357 out: 2364 out:
2358 return error; 2365 return error;
2359} 2366}
2360 2367
2368static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
2369 int error)
2370{
2371 switch (error) {
2372 case 0:
2373 grant_pending_locks(r);
2374 /* grant_pending_locks also sends basts */
2375 break;
2376 case -EAGAIN:
2377 if (force_blocking_asts(lkb))
2378 send_blocking_asts_all(r, lkb);
2379 break;
2380 case -EINPROGRESS:
2381 send_blocking_asts(r, lkb);
2382 break;
2383 }
2384}
2385
2361static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) 2386static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2362{ 2387{
2363 remove_lock(r, lkb); 2388 remove_lock(r, lkb);
2364 queue_cast(r, lkb, -DLM_EUNLOCK); 2389 queue_cast(r, lkb, -DLM_EUNLOCK);
2365 grant_pending_locks(r);
2366 return -DLM_EUNLOCK; 2390 return -DLM_EUNLOCK;
2367} 2391}
2368 2392
2393static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
2394 int error)
2395{
2396 grant_pending_locks(r);
2397}
2398
2369/* returns: 0 did nothing, -DLM_ECANCEL canceled lock */ 2399/* returns: 0 did nothing, -DLM_ECANCEL canceled lock */
2370 2400
2371static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb) 2401static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -2375,12 +2405,18 @@ static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
2375 error = revert_lock(r, lkb); 2405 error = revert_lock(r, lkb);
2376 if (error) { 2406 if (error) {
2377 queue_cast(r, lkb, -DLM_ECANCEL); 2407 queue_cast(r, lkb, -DLM_ECANCEL);
2378 grant_pending_locks(r);
2379 return -DLM_ECANCEL; 2408 return -DLM_ECANCEL;
2380 } 2409 }
2381 return 0; 2410 return 0;
2382} 2411}
2383 2412
2413static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
2414 int error)
2415{
2416 if (error)
2417 grant_pending_locks(r);
2418}
2419
2384/* 2420/*
2385 * Four stage 3 varieties: 2421 * Four stage 3 varieties:
2386 * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock() 2422 * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
@@ -2402,11 +2438,15 @@ static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2402 goto out; 2438 goto out;
2403 } 2439 }
2404 2440
2405 if (is_remote(r)) 2441 if (is_remote(r)) {
2406 /* receive_request() calls do_request() on remote node */ 2442 /* receive_request() calls do_request() on remote node */
2407 error = send_request(r, lkb); 2443 error = send_request(r, lkb);
2408 else 2444 } else {
2409 error = do_request(r, lkb); 2445 error = do_request(r, lkb);
2446 /* for remote locks the request_reply is sent
2447 between do_request and do_request_effects */
2448 do_request_effects(r, lkb, error);
2449 }
2410 out: 2450 out:
2411 return error; 2451 return error;
2412} 2452}
@@ -2417,11 +2457,15 @@ static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2417{ 2457{
2418 int error; 2458 int error;
2419 2459
2420 if (is_remote(r)) 2460 if (is_remote(r)) {
2421 /* receive_convert() calls do_convert() on remote node */ 2461 /* receive_convert() calls do_convert() on remote node */
2422 error = send_convert(r, lkb); 2462 error = send_convert(r, lkb);
2423 else 2463 } else {
2424 error = do_convert(r, lkb); 2464 error = do_convert(r, lkb);
2465 /* for remote locks the convert_reply is sent
2466 between do_convert and do_convert_effects */
2467 do_convert_effects(r, lkb, error);
2468 }
2425 2469
2426 return error; 2470 return error;
2427} 2471}
@@ -2432,11 +2476,15 @@ static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2432{ 2476{
2433 int error; 2477 int error;
2434 2478
2435 if (is_remote(r)) 2479 if (is_remote(r)) {
2436 /* receive_unlock() calls do_unlock() on remote node */ 2480 /* receive_unlock() calls do_unlock() on remote node */
2437 error = send_unlock(r, lkb); 2481 error = send_unlock(r, lkb);
2438 else 2482 } else {
2439 error = do_unlock(r, lkb); 2483 error = do_unlock(r, lkb);
2484 /* for remote locks the unlock_reply is sent
2485 between do_unlock and do_unlock_effects */
2486 do_unlock_effects(r, lkb, error);
2487 }
2440 2488
2441 return error; 2489 return error;
2442} 2490}
@@ -2447,11 +2495,15 @@ static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2447{ 2495{
2448 int error; 2496 int error;
2449 2497
2450 if (is_remote(r)) 2498 if (is_remote(r)) {
2451 /* receive_cancel() calls do_cancel() on remote node */ 2499 /* receive_cancel() calls do_cancel() on remote node */
2452 error = send_cancel(r, lkb); 2500 error = send_cancel(r, lkb);
2453 else 2501 } else {
2454 error = do_cancel(r, lkb); 2502 error = do_cancel(r, lkb);
2503 /* for remote locks the cancel_reply is sent
2504 between do_cancel and do_cancel_effects */
2505 do_cancel_effects(r, lkb, error);
2506 }
2455 2507
2456 return error; 2508 return error;
2457} 2509}
@@ -3191,6 +3243,7 @@ static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
3191 attach_lkb(r, lkb); 3243 attach_lkb(r, lkb);
3192 error = do_request(r, lkb); 3244 error = do_request(r, lkb);
3193 send_request_reply(r, lkb, error); 3245 send_request_reply(r, lkb, error);
3246 do_request_effects(r, lkb, error);
3194 3247
3195 unlock_rsb(r); 3248 unlock_rsb(r);
3196 put_rsb(r); 3249 put_rsb(r);
@@ -3226,15 +3279,19 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
3226 goto out; 3279 goto out;
3227 3280
3228 receive_flags(lkb, ms); 3281 receive_flags(lkb, ms);
3282
3229 error = receive_convert_args(ls, lkb, ms); 3283 error = receive_convert_args(ls, lkb, ms);
3230 if (error) 3284 if (error) {
3231 goto out_reply; 3285 send_convert_reply(r, lkb, error);
3286 goto out;
3287 }
3288
3232 reply = !down_conversion(lkb); 3289 reply = !down_conversion(lkb);
3233 3290
3234 error = do_convert(r, lkb); 3291 error = do_convert(r, lkb);
3235 out_reply:
3236 if (reply) 3292 if (reply)
3237 send_convert_reply(r, lkb, error); 3293 send_convert_reply(r, lkb, error);
3294 do_convert_effects(r, lkb, error);
3238 out: 3295 out:
3239 unlock_rsb(r); 3296 unlock_rsb(r);
3240 put_rsb(r); 3297 put_rsb(r);
@@ -3266,13 +3323,16 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
3266 goto out; 3323 goto out;
3267 3324
3268 receive_flags(lkb, ms); 3325 receive_flags(lkb, ms);
3326
3269 error = receive_unlock_args(ls, lkb, ms); 3327 error = receive_unlock_args(ls, lkb, ms);
3270 if (error) 3328 if (error) {
3271 goto out_reply; 3329 send_unlock_reply(r, lkb, error);
3330 goto out;
3331 }
3272 3332
3273 error = do_unlock(r, lkb); 3333 error = do_unlock(r, lkb);
3274 out_reply:
3275 send_unlock_reply(r, lkb, error); 3334 send_unlock_reply(r, lkb, error);
3335 do_unlock_effects(r, lkb, error);
3276 out: 3336 out:
3277 unlock_rsb(r); 3337 unlock_rsb(r);
3278 put_rsb(r); 3338 put_rsb(r);
@@ -3307,6 +3367,7 @@ static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
3307 3367
3308 error = do_cancel(r, lkb); 3368 error = do_cancel(r, lkb);
3309 send_cancel_reply(r, lkb, error); 3369 send_cancel_reply(r, lkb, error);
3370 do_cancel_effects(r, lkb, error);
3310 out: 3371 out:
3311 unlock_rsb(r); 3372 unlock_rsb(r);
3312 put_rsb(r); 3373 put_rsb(r);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index c010ecfc0d29..f994a7dfda85 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -148,7 +148,7 @@ static void lockspace_kobj_release(struct kobject *k)
148 kfree(ls); 148 kfree(ls);
149} 149}
150 150
151static struct sysfs_ops dlm_attr_ops = { 151static const struct sysfs_ops dlm_attr_ops = {
152 .show = dlm_attr_show, 152 .show = dlm_attr_show,
153 .store = dlm_attr_store, 153 .store = dlm_attr_store,
154}; 154};
@@ -191,6 +191,18 @@ static int do_uevent(struct dlm_ls *ls, int in)
191 return error; 191 return error;
192} 192}
193 193
194static int dlm_uevent(struct kset *kset, struct kobject *kobj,
195 struct kobj_uevent_env *env)
196{
197 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
198
199 add_uevent_var(env, "LOCKSPACE=%s", ls->ls_name);
200 return 0;
201}
202
203static struct kset_uevent_ops dlm_uevent_ops = {
204 .uevent = dlm_uevent,
205};
194 206
195int __init dlm_lockspace_init(void) 207int __init dlm_lockspace_init(void)
196{ 208{
@@ -199,7 +211,7 @@ int __init dlm_lockspace_init(void)
199 INIT_LIST_HEAD(&lslist); 211 INIT_LIST_HEAD(&lslist);
200 spin_lock_init(&lslist_lock); 212 spin_lock_init(&lslist_lock);
201 213
202 dlm_kset = kset_create_and_add("dlm", NULL, kernel_kobj); 214 dlm_kset = kset_create_and_add("dlm", &dlm_uevent_ops, kernel_kobj);
203 if (!dlm_kset) { 215 if (!dlm_kset) {
204 printk(KERN_WARNING "%s: can not create kset\n", __func__); 216 printk(KERN_WARNING "%s: can not create kset\n", __func__);
205 return -ENOMEM; 217 return -ENOMEM;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 52cab160893c..c0d35c620526 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -51,6 +51,7 @@
51#include <linux/file.h> 51#include <linux/file.h>
52#include <linux/mutex.h> 52#include <linux/mutex.h>
53#include <linux/sctp.h> 53#include <linux/sctp.h>
54#include <linux/slab.h>
54#include <net/sctp/user.h> 55#include <net/sctp/user.h>
55#include <net/ipv6.h> 56#include <net/ipv6.h>
56 57
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 84f70bfb0baf..b12532e553f8 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -312,7 +312,7 @@ int dlm_ls_stop(struct dlm_ls *ls)
312 /* 312 /*
313 * This in_recovery lock does two things: 313 * This in_recovery lock does two things:
314 * 1) Keeps this function from returning until all threads are out 314 * 1) Keeps this function from returning until all threads are out
315 * of locking routines and locking is truely stopped. 315 * of locking routines and locking is truly stopped.
316 * 2) Keeps any new requests from being processed until it's unlocked 316 * 2) Keeps any new requests from being processed until it's unlocked
317 * when recovery is complete. 317 * when recovery is complete.
318 */ 318 */
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 052095cd592f..2c6ad518100d 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -9,6 +9,7 @@
9#include <net/genetlink.h> 9#include <net/genetlink.h>
10#include <linux/dlm.h> 10#include <linux/dlm.h>
11#include <linux/dlm_netlink.h> 11#include <linux/dlm_netlink.h>
12#include <linux/gfp.h>
12 13
13#include "dlm_internal.h" 14#include "dlm_internal.h"
14 15
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index b5f89aef3b29..d45c02db6943 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -11,6 +11,7 @@
11#include <linux/poll.h> 11#include <linux/poll.h>
12#include <linux/dlm.h> 12#include <linux/dlm.h>
13#include <linux/dlm_plock.h> 13#include <linux/dlm_plock.h>
14#include <linux/slab.h>
14 15
15#include "dlm_internal.h" 16#include "dlm_internal.h"
16#include "lockspace.h" 17#include "lockspace.h"
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index e73a4bb572aa..8b6e73c47435 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. 2 * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved.
3 * 3 *
4 * This copyrighted material is made available to anyone wishing to use, 4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions 5 * modify, copy, or redistribute it subject to the terms and conditions
@@ -17,6 +17,7 @@
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/dlm.h> 18#include <linux/dlm.h>
19#include <linux/dlm_device.h> 19#include <linux/dlm_device.h>
20#include <linux/slab.h>
20 21
21#include "dlm_internal.h" 22#include "dlm_internal.h"
22#include "lockspace.h" 23#include "lockspace.h"
@@ -173,7 +174,7 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
173/* we could possibly check if the cancel of an orphan has resulted in the lkb 174/* we could possibly check if the cancel of an orphan has resulted in the lkb
174 being removed and then remove that lkb from the orphans list and free it */ 175 being removed and then remove that lkb from the orphans list and free it */
175 176
176void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode) 177void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
177{ 178{
178 struct dlm_ls *ls; 179 struct dlm_ls *ls;
179 struct dlm_user_args *ua; 180 struct dlm_user_args *ua;
@@ -206,8 +207,10 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
206 207
207 ast_type = lkb->lkb_ast_type; 208 ast_type = lkb->lkb_ast_type;
208 lkb->lkb_ast_type |= type; 209 lkb->lkb_ast_type |= type;
209 if (bastmode) 210 if (type == AST_BAST)
210 lkb->lkb_bastmode = bastmode; 211 lkb->lkb_bastmode = mode;
212 else
213 lkb->lkb_castmode = mode;
211 214
212 if (!ast_type) { 215 if (!ast_type) {
213 kref_get(&lkb->lkb_ref); 216 kref_get(&lkb->lkb_ref);
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index 1c9686492286..f196091dd7ff 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. 2 * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved.
3 * 3 *
4 * This copyrighted material is made available to anyone wishing to use, 4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions 5 * modify, copy, or redistribute it subject to the terms and conditions
@@ -9,7 +9,7 @@
9#ifndef __USER_DOT_H__ 9#ifndef __USER_DOT_H__
10#define __USER_DOT_H__ 10#define __USER_DOT_H__
11 11
12void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode); 12void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode);
13int dlm_user_init(void); 13int dlm_user_init(void);
14void dlm_user_exit(void); 14void dlm_user_exit(void);
15int dlm_device_deregister(struct dlm_ls *ls); 15int dlm_device_deregister(struct dlm_ls *ls);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index fbb6e5eed697..1cc087635a5e 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -33,6 +33,7 @@
33#include <linux/crypto.h> 33#include <linux/crypto.h>
34#include <linux/file.h> 34#include <linux/file.h>
35#include <linux/scatterlist.h> 35#include <linux/scatterlist.h>
36#include <linux/slab.h>
36#include <asm/unaligned.h> 37#include <asm/unaligned.h>
37#include "ecryptfs_kernel.h" 38#include "ecryptfs_kernel.h"
38 39
@@ -381,8 +382,8 @@ out:
381static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num, 382static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num,
382 struct ecryptfs_crypt_stat *crypt_stat) 383 struct ecryptfs_crypt_stat *crypt_stat)
383{ 384{
384 (*offset) = (crypt_stat->num_header_bytes_at_front 385 (*offset) = ecryptfs_lower_header_size(crypt_stat)
385 + (crypt_stat->extent_size * extent_num)); 386 + (crypt_stat->extent_size * extent_num);
386} 387}
387 388
388/** 389/**
@@ -834,13 +835,13 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
834 set_extent_mask_and_shift(crypt_stat); 835 set_extent_mask_and_shift(crypt_stat);
835 crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES; 836 crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES;
836 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) 837 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
837 crypt_stat->num_header_bytes_at_front = 0; 838 crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
838 else { 839 else {
839 if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) 840 if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)
840 crypt_stat->num_header_bytes_at_front = 841 crypt_stat->metadata_size =
841 ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; 842 ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
842 else 843 else
843 crypt_stat->num_header_bytes_at_front = PAGE_CACHE_SIZE; 844 crypt_stat->metadata_size = PAGE_CACHE_SIZE;
844 } 845 }
845} 846}
846 847
@@ -1107,9 +1108,9 @@ static void write_ecryptfs_marker(char *page_virt, size_t *written)
1107 (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; 1108 (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
1108} 1109}
1109 1110
1110static void 1111void ecryptfs_write_crypt_stat_flags(char *page_virt,
1111write_ecryptfs_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat, 1112 struct ecryptfs_crypt_stat *crypt_stat,
1112 size_t *written) 1113 size_t *written)
1113{ 1114{
1114 u32 flags = 0; 1115 u32 flags = 0;
1115 int i; 1116 int i;
@@ -1237,8 +1238,7 @@ ecryptfs_write_header_metadata(char *virt,
1237 1238
1238 header_extent_size = (u32)crypt_stat->extent_size; 1239 header_extent_size = (u32)crypt_stat->extent_size;
1239 num_header_extents_at_front = 1240 num_header_extents_at_front =
1240 (u16)(crypt_stat->num_header_bytes_at_front 1241 (u16)(crypt_stat->metadata_size / crypt_stat->extent_size);
1241 / crypt_stat->extent_size);
1242 put_unaligned_be32(header_extent_size, virt); 1242 put_unaligned_be32(header_extent_size, virt);
1243 virt += 4; 1243 virt += 4;
1244 put_unaligned_be16(num_header_extents_at_front, virt); 1244 put_unaligned_be16(num_header_extents_at_front, virt);
@@ -1291,7 +1291,8 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max,
1291 offset = ECRYPTFS_FILE_SIZE_BYTES; 1291 offset = ECRYPTFS_FILE_SIZE_BYTES;
1292 write_ecryptfs_marker((page_virt + offset), &written); 1292 write_ecryptfs_marker((page_virt + offset), &written);
1293 offset += written; 1293 offset += written;
1294 write_ecryptfs_flags((page_virt + offset), crypt_stat, &written); 1294 ecryptfs_write_crypt_stat_flags((page_virt + offset), crypt_stat,
1295 &written);
1295 offset += written; 1296 offset += written;
1296 ecryptfs_write_header_metadata((page_virt + offset), crypt_stat, 1297 ecryptfs_write_header_metadata((page_virt + offset), crypt_stat,
1297 &written); 1298 &written);
@@ -1381,7 +1382,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
1381 rc = -EINVAL; 1382 rc = -EINVAL;
1382 goto out; 1383 goto out;
1383 } 1384 }
1384 virt_len = crypt_stat->num_header_bytes_at_front; 1385 virt_len = crypt_stat->metadata_size;
1385 order = get_order(virt_len); 1386 order = get_order(virt_len);
1386 /* Released in this function */ 1387 /* Released in this function */
1387 virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order); 1388 virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order);
@@ -1427,16 +1428,15 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
1427 header_extent_size = get_unaligned_be32(virt); 1428 header_extent_size = get_unaligned_be32(virt);
1428 virt += sizeof(__be32); 1429 virt += sizeof(__be32);
1429 num_header_extents_at_front = get_unaligned_be16(virt); 1430 num_header_extents_at_front = get_unaligned_be16(virt);
1430 crypt_stat->num_header_bytes_at_front = 1431 crypt_stat->metadata_size = (((size_t)num_header_extents_at_front
1431 (((size_t)num_header_extents_at_front 1432 * (size_t)header_extent_size));
1432 * (size_t)header_extent_size));
1433 (*bytes_read) = (sizeof(__be32) + sizeof(__be16)); 1433 (*bytes_read) = (sizeof(__be32) + sizeof(__be16));
1434 if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE) 1434 if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE)
1435 && (crypt_stat->num_header_bytes_at_front 1435 && (crypt_stat->metadata_size
1436 < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) { 1436 < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) {
1437 rc = -EINVAL; 1437 rc = -EINVAL;
1438 printk(KERN_WARNING "Invalid header size: [%zd]\n", 1438 printk(KERN_WARNING "Invalid header size: [%zd]\n",
1439 crypt_stat->num_header_bytes_at_front); 1439 crypt_stat->metadata_size);
1440 } 1440 }
1441 return rc; 1441 return rc;
1442} 1442}
@@ -1451,8 +1451,7 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
1451 */ 1451 */
1452static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat) 1452static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat)
1453{ 1453{
1454 crypt_stat->num_header_bytes_at_front = 1454 crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
1455 ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
1456} 1455}
1457 1456
1458/** 1457/**
@@ -1606,6 +1605,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
1606 ecryptfs_dentry, 1605 ecryptfs_dentry,
1607 ECRYPTFS_VALIDATE_HEADER_SIZE); 1606 ECRYPTFS_VALIDATE_HEADER_SIZE);
1608 if (rc) { 1607 if (rc) {
1608 memset(page_virt, 0, PAGE_CACHE_SIZE);
1609 rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode); 1609 rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode);
1610 if (rc) { 1610 if (rc) {
1611 printk(KERN_DEBUG "Valid eCryptfs headers not found in " 1611 printk(KERN_DEBUG "Valid eCryptfs headers not found in "
@@ -1748,7 +1748,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
1748 char *cipher_name, size_t *key_size) 1748 char *cipher_name, size_t *key_size)
1749{ 1749{
1750 char dummy_key[ECRYPTFS_MAX_KEY_BYTES]; 1750 char dummy_key[ECRYPTFS_MAX_KEY_BYTES];
1751 char *full_alg_name; 1751 char *full_alg_name = NULL;
1752 int rc; 1752 int rc;
1753 1753
1754 *key_tfm = NULL; 1754 *key_tfm = NULL;
@@ -1763,7 +1763,6 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
1763 if (rc) 1763 if (rc)
1764 goto out; 1764 goto out;
1765 *key_tfm = crypto_alloc_blkcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC); 1765 *key_tfm = crypto_alloc_blkcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC);
1766 kfree(full_alg_name);
1767 if (IS_ERR(*key_tfm)) { 1766 if (IS_ERR(*key_tfm)) {
1768 rc = PTR_ERR(*key_tfm); 1767 rc = PTR_ERR(*key_tfm);
1769 printk(KERN_ERR "Unable to allocate crypto cipher with name " 1768 printk(KERN_ERR "Unable to allocate crypto cipher with name "
@@ -1786,6 +1785,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
1786 goto out; 1785 goto out;
1787 } 1786 }
1788out: 1787out:
1788 kfree(full_alg_name);
1789 return rc; 1789 return rc;
1790} 1790}
1791 1791
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 8f006a0d6076..906e803f7f79 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -26,6 +26,7 @@
26#include <linux/namei.h> 26#include <linux/namei.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/fs_stack.h> 28#include <linux/fs_stack.h>
29#include <linux/slab.h>
29#include "ecryptfs_kernel.h" 30#include "ecryptfs_kernel.h"
30 31
31/** 32/**
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 542f625312f3..bfc2e0f78f00 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -35,6 +35,7 @@
35#include <linux/scatterlist.h> 35#include <linux/scatterlist.h>
36#include <linux/hash.h> 36#include <linux/hash.h>
37#include <linux/nsproxy.h> 37#include <linux/nsproxy.h>
38#include <linux/backing-dev.h>
38 39
39/* Version verification for shared data structures w/ userspace */ 40/* Version verification for shared data structures w/ userspace */
40#define ECRYPTFS_VERSION_MAJOR 0x00 41#define ECRYPTFS_VERSION_MAJOR 0x00
@@ -273,7 +274,7 @@ struct ecryptfs_crypt_stat {
273 u32 flags; 274 u32 flags;
274 unsigned int file_version; 275 unsigned int file_version;
275 size_t iv_bytes; 276 size_t iv_bytes;
276 size_t num_header_bytes_at_front; 277 size_t metadata_size;
277 size_t extent_size; /* Data extent size; default is 4096 */ 278 size_t extent_size; /* Data extent size; default is 4096 */
278 size_t key_size; 279 size_t key_size;
279 size_t extent_shift; 280 size_t extent_shift;
@@ -393,6 +394,7 @@ struct ecryptfs_mount_crypt_stat {
393struct ecryptfs_sb_info { 394struct ecryptfs_sb_info {
394 struct super_block *wsi_sb; 395 struct super_block *wsi_sb;
395 struct ecryptfs_mount_crypt_stat mount_crypt_stat; 396 struct ecryptfs_mount_crypt_stat mount_crypt_stat;
397 struct backing_dev_info bdi;
396}; 398};
397 399
398/* file private data. */ 400/* file private data. */
@@ -464,6 +466,14 @@ struct ecryptfs_daemon {
464 466
465extern struct mutex ecryptfs_daemon_hash_mux; 467extern struct mutex ecryptfs_daemon_hash_mux;
466 468
469static inline size_t
470ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat)
471{
472 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
473 return 0;
474 return crypt_stat->metadata_size;
475}
476
467static inline struct ecryptfs_file_info * 477static inline struct ecryptfs_file_info *
468ecryptfs_file_to_private(struct file *file) 478ecryptfs_file_to_private(struct file *file)
469{ 479{
@@ -651,6 +661,9 @@ int ecryptfs_decrypt_page(struct page *page);
651int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry); 661int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry);
652int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); 662int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry);
653int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry); 663int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry);
664void ecryptfs_write_crypt_stat_flags(char *page_virt,
665 struct ecryptfs_crypt_stat *crypt_stat,
666 size_t *written);
654int ecryptfs_read_and_validate_header_region(char *data, 667int ecryptfs_read_and_validate_header_region(char *data,
655 struct inode *ecryptfs_inode); 668 struct inode *ecryptfs_inode);
656int ecryptfs_read_and_validate_xattr_region(char *page_virt, 669int ecryptfs_read_and_validate_xattr_region(char *page_virt,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 9e944057001b..e7440a6f5ebf 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -25,6 +25,7 @@
25 25
26#include <linux/file.h> 26#include <linux/file.h>
27#include <linux/poll.h> 27#include <linux/poll.h>
28#include <linux/slab.h>
28#include <linux/mount.h> 29#include <linux/mount.h>
29#include <linux/pagemap.h> 30#include <linux/pagemap.h>
30#include <linux/security.h> 31#include <linux/security.h>
@@ -158,7 +159,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
158 struct dentry *ecryptfs_dentry = file->f_path.dentry; 159 struct dentry *ecryptfs_dentry = file->f_path.dentry;
159 /* Private value of ecryptfs_dentry allocated in 160 /* Private value of ecryptfs_dentry allocated in
160 * ecryptfs_lookup() */ 161 * ecryptfs_lookup() */
161 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); 162 struct dentry *lower_dentry;
162 struct ecryptfs_file_info *file_info; 163 struct ecryptfs_file_info *file_info;
163 164
164 mount_crypt_stat = &ecryptfs_superblock_to_private( 165 mount_crypt_stat = &ecryptfs_superblock_to_private(
@@ -191,13 +192,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
191 | ECRYPTFS_ENCRYPTED); 192 | ECRYPTFS_ENCRYPTED);
192 } 193 }
193 mutex_unlock(&crypt_stat->cs_mutex); 194 mutex_unlock(&crypt_stat->cs_mutex);
194 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
195 && !(file->f_flags & O_RDONLY)) {
196 rc = -EPERM;
197 printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
198 "file must hence be opened RO\n", __func__);
199 goto out;
200 }
201 if (!ecryptfs_inode_to_private(inode)->lower_file) { 195 if (!ecryptfs_inode_to_private(inode)->lower_file) {
202 rc = ecryptfs_init_persistent_file(ecryptfs_dentry); 196 rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
203 if (rc) { 197 if (rc) {
@@ -208,6 +202,13 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
208 goto out; 202 goto out;
209 } 203 }
210 } 204 }
205 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
206 && !(file->f_flags & O_RDONLY)) {
207 rc = -EPERM;
208 printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
209 "file must hence be opened RO\n", __func__);
210 goto out;
211 }
211 ecryptfs_set_file_lower( 212 ecryptfs_set_file_lower(
212 file, ecryptfs_inode_to_private(inode)->lower_file); 213 file, ecryptfs_inode_to_private(inode)->lower_file);
213 if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { 214 if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
@@ -299,7 +300,6 @@ static int ecryptfs_ioctl(struct inode *inode, struct file *file,
299const struct file_operations ecryptfs_dir_fops = { 300const struct file_operations ecryptfs_dir_fops = {
300 .readdir = ecryptfs_readdir, 301 .readdir = ecryptfs_readdir,
301 .ioctl = ecryptfs_ioctl, 302 .ioctl = ecryptfs_ioctl,
302 .mmap = generic_file_mmap,
303 .open = ecryptfs_open, 303 .open = ecryptfs_open,
304 .flush = ecryptfs_flush, 304 .flush = ecryptfs_flush,
305 .release = ecryptfs_release, 305 .release = ecryptfs_release,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 429ca0b3ba08..e2d4418affac 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -31,6 +31,7 @@
31#include <linux/mount.h> 31#include <linux/mount.h>
32#include <linux/crypto.h> 32#include <linux/crypto.h>
33#include <linux/fs_stack.h> 33#include <linux/fs_stack.h>
34#include <linux/slab.h>
34#include <asm/unaligned.h> 35#include <asm/unaligned.h>
35#include "ecryptfs_kernel.h" 36#include "ecryptfs_kernel.h"
36 37
@@ -282,7 +283,8 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
282 goto out; 283 goto out;
283 } 284 }
284 rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, 285 rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
285 ecryptfs_dir_inode->i_sb, 1); 286 ecryptfs_dir_inode->i_sb,
287 ECRYPTFS_INTERPOSE_FLAG_D_ADD);
286 if (rc) { 288 if (rc) {
287 printk(KERN_ERR "%s: Error interposing; rc = [%d]\n", 289 printk(KERN_ERR "%s: Error interposing; rc = [%d]\n",
288 __func__, rc); 290 __func__, rc);
@@ -322,6 +324,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
322 rc = ecryptfs_read_and_validate_header_region(page_virt, 324 rc = ecryptfs_read_and_validate_header_region(page_virt,
323 ecryptfs_dentry->d_inode); 325 ecryptfs_dentry->d_inode);
324 if (rc) { 326 if (rc) {
327 memset(page_virt, 0, PAGE_CACHE_SIZE);
325 rc = ecryptfs_read_and_validate_xattr_region(page_virt, 328 rc = ecryptfs_read_and_validate_xattr_region(page_virt,
326 ecryptfs_dentry); 329 ecryptfs_dentry);
327 if (rc) { 330 if (rc) {
@@ -334,7 +337,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
334 ecryptfs_dentry->d_sb)->mount_crypt_stat; 337 ecryptfs_dentry->d_sb)->mount_crypt_stat;
335 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { 338 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) {
336 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) 339 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
337 file_size = (crypt_stat->num_header_bytes_at_front 340 file_size = (crypt_stat->metadata_size
338 + i_size_read(lower_dentry->d_inode)); 341 + i_size_read(lower_dentry->d_inode));
339 else 342 else
340 file_size = i_size_read(lower_dentry->d_inode); 343 file_size = i_size_read(lower_dentry->d_inode);
@@ -386,9 +389,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
386 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); 389 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
387 if (IS_ERR(lower_dentry)) { 390 if (IS_ERR(lower_dentry)) {
388 rc = PTR_ERR(lower_dentry); 391 rc = PTR_ERR(lower_dentry);
389 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " 392 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
390 "lower_dentry = [%s]\n", __func__, rc, 393 "[%d] on lower_dentry = [%s]\n", __func__, rc,
391 ecryptfs_dentry->d_name.name); 394 encrypted_and_encoded_name);
392 goto out_d_drop; 395 goto out_d_drop;
393 } 396 }
394 if (lower_dentry->d_inode) 397 if (lower_dentry->d_inode)
@@ -415,9 +418,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
415 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); 418 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
416 if (IS_ERR(lower_dentry)) { 419 if (IS_ERR(lower_dentry)) {
417 rc = PTR_ERR(lower_dentry); 420 rc = PTR_ERR(lower_dentry);
418 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " 421 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
419 "lower_dentry = [%s]\n", __func__, rc, 422 "[%d] on lower_dentry = [%s]\n", __func__, rc,
420 encrypted_and_encoded_name); 423 encrypted_and_encoded_name);
421 goto out_d_drop; 424 goto out_d_drop;
422 } 425 }
423lookup_and_interpose: 426lookup_and_interpose:
@@ -454,8 +457,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
454 rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0); 457 rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0);
455 if (rc) 458 if (rc)
456 goto out_lock; 459 goto out_lock;
457 fsstack_copy_attr_times(dir, lower_new_dentry->d_inode); 460 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
458 fsstack_copy_inode_size(dir, lower_new_dentry->d_inode); 461 fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode);
459 old_dentry->d_inode->i_nlink = 462 old_dentry->d_inode->i_nlink =
460 ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink; 463 ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink;
461 i_size_write(new_dentry->d_inode, file_size_save); 464 i_size_write(new_dentry->d_inode, file_size_save);
@@ -463,9 +466,6 @@ out_lock:
463 unlock_dir(lower_dir_dentry); 466 unlock_dir(lower_dir_dentry);
464 dput(lower_new_dentry); 467 dput(lower_new_dentry);
465 dput(lower_old_dentry); 468 dput(lower_old_dentry);
466 d_drop(lower_old_dentry);
467 d_drop(new_dentry);
468 d_drop(old_dentry);
469 return rc; 469 return rc;
470} 470}
471 471
@@ -614,6 +614,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
614 struct dentry *lower_new_dentry; 614 struct dentry *lower_new_dentry;
615 struct dentry *lower_old_dir_dentry; 615 struct dentry *lower_old_dir_dentry;
616 struct dentry *lower_new_dir_dentry; 616 struct dentry *lower_new_dir_dentry;
617 struct dentry *trap = NULL;
617 618
618 lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); 619 lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
619 lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); 620 lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
@@ -621,7 +622,17 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
621 dget(lower_new_dentry); 622 dget(lower_new_dentry);
622 lower_old_dir_dentry = dget_parent(lower_old_dentry); 623 lower_old_dir_dentry = dget_parent(lower_old_dentry);
623 lower_new_dir_dentry = dget_parent(lower_new_dentry); 624 lower_new_dir_dentry = dget_parent(lower_new_dentry);
624 lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); 625 trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
626 /* source should not be ancestor of target */
627 if (trap == lower_old_dentry) {
628 rc = -EINVAL;
629 goto out_lock;
630 }
631 /* target should not be ancestor of source */
632 if (trap == lower_new_dentry) {
633 rc = -ENOTEMPTY;
634 goto out_lock;
635 }
625 rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry, 636 rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry,
626 lower_new_dir_dentry->d_inode, lower_new_dentry); 637 lower_new_dir_dentry->d_inode, lower_new_dentry);
627 if (rc) 638 if (rc)
@@ -638,38 +649,17 @@ out_lock:
638 return rc; 649 return rc;
639} 650}
640 651
641static int 652static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
642ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) 653 size_t *bufsiz)
643{ 654{
655 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
644 char *lower_buf; 656 char *lower_buf;
645 size_t lower_bufsiz; 657 size_t lower_bufsiz = PATH_MAX;
646 struct dentry *lower_dentry;
647 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
648 char *plaintext_name;
649 size_t plaintext_name_size;
650 mm_segment_t old_fs; 658 mm_segment_t old_fs;
651 int rc; 659 int rc;
652 660
653 lower_dentry = ecryptfs_dentry_to_lower(dentry);
654 if (!lower_dentry->d_inode->i_op->readlink) {
655 rc = -EINVAL;
656 goto out;
657 }
658 mount_crypt_stat = &ecryptfs_superblock_to_private(
659 dentry->d_sb)->mount_crypt_stat;
660 /*
661 * If the lower filename is encrypted, it will result in a significantly
662 * longer name. If needed, truncate the name after decode and decrypt.
663 */
664 if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
665 lower_bufsiz = PATH_MAX;
666 else
667 lower_bufsiz = bufsiz;
668 /* Released in this function */
669 lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL); 661 lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL);
670 if (lower_buf == NULL) { 662 if (!lower_buf) {
671 printk(KERN_ERR "%s: Out of memory whilst attempting to "
672 "kmalloc [%zd] bytes\n", __func__, lower_bufsiz);
673 rc = -ENOMEM; 663 rc = -ENOMEM;
674 goto out; 664 goto out;
675 } 665 }
@@ -679,29 +669,31 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
679 (char __user *)lower_buf, 669 (char __user *)lower_buf,
680 lower_bufsiz); 670 lower_bufsiz);
681 set_fs(old_fs); 671 set_fs(old_fs);
682 if (rc >= 0) { 672 if (rc < 0)
683 rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name, 673 goto out;
684 &plaintext_name_size, 674 lower_bufsiz = rc;
685 dentry, lower_buf, 675 rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry,
686 rc); 676 lower_buf, lower_bufsiz);
687 if (rc) { 677out:
688 printk(KERN_ERR "%s: Error attempting to decode and "
689 "decrypt filename; rc = [%d]\n", __func__,
690 rc);
691 goto out_free_lower_buf;
692 }
693 /* Check for bufsiz <= 0 done in sys_readlinkat() */
694 rc = copy_to_user(buf, plaintext_name,
695 min((size_t) bufsiz, plaintext_name_size));
696 if (rc)
697 rc = -EFAULT;
698 else
699 rc = plaintext_name_size;
700 kfree(plaintext_name);
701 fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode);
702 }
703out_free_lower_buf:
704 kfree(lower_buf); 678 kfree(lower_buf);
679 return rc;
680}
681
682static int
683ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
684{
685 char *kbuf;
686 size_t kbufsiz, copied;
687 int rc;
688
689 rc = ecryptfs_readlink_lower(dentry, &kbuf, &kbufsiz);
690 if (rc)
691 goto out;
692 copied = min_t(size_t, bufsiz, kbufsiz);
693 rc = copy_to_user(buf, kbuf, copied) ? -EFAULT : copied;
694 kfree(kbuf);
695 fsstack_copy_attr_atime(dentry->d_inode,
696 ecryptfs_dentry_to_lower(dentry)->d_inode);
705out: 697out:
706 return rc; 698 return rc;
707} 699}
@@ -715,31 +707,31 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
715 /* Released in ecryptfs_put_link(); only release here on error */ 707 /* Released in ecryptfs_put_link(); only release here on error */
716 buf = kmalloc(len, GFP_KERNEL); 708 buf = kmalloc(len, GFP_KERNEL);
717 if (!buf) { 709 if (!buf) {
718 rc = -ENOMEM; 710 buf = ERR_PTR(-ENOMEM);
719 goto out; 711 goto out;
720 } 712 }
721 old_fs = get_fs(); 713 old_fs = get_fs();
722 set_fs(get_ds()); 714 set_fs(get_ds());
723 rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); 715 rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
724 set_fs(old_fs); 716 set_fs(old_fs);
725 if (rc < 0) 717 if (rc < 0) {
726 goto out_free; 718 kfree(buf);
727 else 719 buf = ERR_PTR(rc);
720 } else
728 buf[rc] = '\0'; 721 buf[rc] = '\0';
729 rc = 0;
730 nd_set_link(nd, buf);
731 goto out;
732out_free:
733 kfree(buf);
734out: 722out:
735 return ERR_PTR(rc); 723 nd_set_link(nd, buf);
724 return NULL;
736} 725}
737 726
738static void 727static void
739ecryptfs_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr) 728ecryptfs_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr)
740{ 729{
741 /* Free the char* */ 730 char *buf = nd_get_link(nd);
742 kfree(nd_get_link(nd)); 731 if (!IS_ERR(buf)) {
732 /* Free the char* */
733 kfree(buf);
734 }
743} 735}
744 736
745/** 737/**
@@ -759,7 +751,7 @@ upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat,
759{ 751{
760 loff_t lower_size; 752 loff_t lower_size;
761 753
762 lower_size = crypt_stat->num_header_bytes_at_front; 754 lower_size = ecryptfs_lower_header_size(crypt_stat);
763 if (upper_size != 0) { 755 if (upper_size != 0) {
764 loff_t num_extents; 756 loff_t num_extents;
765 757
@@ -772,18 +764,23 @@ upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat,
772} 764}
773 765
774/** 766/**
775 * ecryptfs_truncate 767 * truncate_upper
776 * @dentry: The ecryptfs layer dentry 768 * @dentry: The ecryptfs layer dentry
777 * @new_length: The length to expand the file to 769 * @ia: Address of the ecryptfs inode's attributes
770 * @lower_ia: Address of the lower inode's attributes
778 * 771 *
779 * Function to handle truncations modifying the size of the file. Note 772 * Function to handle truncations modifying the size of the file. Note
780 * that the file sizes are interpolated. When expanding, we are simply 773 * that the file sizes are interpolated. When expanding, we are simply
781 * writing strings of 0's out. When truncating, we need to modify the 774 * writing strings of 0's out. When truncating, we truncate the upper
782 * underlying file size according to the page index interpolations. 775 * inode and update the lower_ia according to the page index
776 * interpolations. If ATTR_SIZE is set in lower_ia->ia_valid upon return,
777 * the caller must use lower_ia in a call to notify_change() to perform
778 * the truncation of the lower inode.
783 * 779 *
784 * Returns zero on success; non-zero otherwise 780 * Returns zero on success; non-zero otherwise
785 */ 781 */
786int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) 782static int truncate_upper(struct dentry *dentry, struct iattr *ia,
783 struct iattr *lower_ia)
787{ 784{
788 int rc = 0; 785 int rc = 0;
789 struct inode *inode = dentry->d_inode; 786 struct inode *inode = dentry->d_inode;
@@ -794,8 +791,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
794 loff_t lower_size_before_truncate; 791 loff_t lower_size_before_truncate;
795 loff_t lower_size_after_truncate; 792 loff_t lower_size_after_truncate;
796 793
797 if (unlikely((new_length == i_size))) 794 if (unlikely((ia->ia_size == i_size))) {
795 lower_ia->ia_valid &= ~ATTR_SIZE;
798 goto out; 796 goto out;
797 }
799 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; 798 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
800 /* Set up a fake ecryptfs file, this is used to interface with 799 /* Set up a fake ecryptfs file, this is used to interface with
801 * the file in the underlying filesystem so that the 800 * the file in the underlying filesystem so that the
@@ -815,28 +814,30 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
815 &fake_ecryptfs_file, 814 &fake_ecryptfs_file,
816 ecryptfs_inode_to_private(dentry->d_inode)->lower_file); 815 ecryptfs_inode_to_private(dentry->d_inode)->lower_file);
817 /* Switch on growing or shrinking file */ 816 /* Switch on growing or shrinking file */
818 if (new_length > i_size) { 817 if (ia->ia_size > i_size) {
819 char zero[] = { 0x00 }; 818 char zero[] = { 0x00 };
820 819
820 lower_ia->ia_valid &= ~ATTR_SIZE;
821 /* Write a single 0 at the last position of the file; 821 /* Write a single 0 at the last position of the file;
822 * this triggers code that will fill in 0's throughout 822 * this triggers code that will fill in 0's throughout
823 * the intermediate portion of the previous end of the 823 * the intermediate portion of the previous end of the
824 * file and the new and of the file */ 824 * file and the new and of the file */
825 rc = ecryptfs_write(&fake_ecryptfs_file, zero, 825 rc = ecryptfs_write(&fake_ecryptfs_file, zero,
826 (new_length - 1), 1); 826 (ia->ia_size - 1), 1);
827 } else { /* new_length < i_size_read(inode) */ 827 } else { /* ia->ia_size < i_size_read(inode) */
828 /* We're chopping off all the pages down do the page 828 /* We're chopping off all the pages down to the page
829 * in which new_length is located. Fill in the end of 829 * in which ia->ia_size is located. Fill in the end of
830 * that page from (new_length & ~PAGE_CACHE_MASK) to 830 * that page from (ia->ia_size & ~PAGE_CACHE_MASK) to
831 * PAGE_CACHE_SIZE with zeros. */ 831 * PAGE_CACHE_SIZE with zeros. */
832 size_t num_zeros = (PAGE_CACHE_SIZE 832 size_t num_zeros = (PAGE_CACHE_SIZE
833 - (new_length & ~PAGE_CACHE_MASK)); 833 - (ia->ia_size & ~PAGE_CACHE_MASK));
834 834
835 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 835 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
836 rc = vmtruncate(inode, new_length); 836 rc = vmtruncate(inode, ia->ia_size);
837 if (rc) 837 if (rc)
838 goto out_free; 838 goto out_free;
839 rc = vmtruncate(lower_dentry->d_inode, new_length); 839 lower_ia->ia_size = ia->ia_size;
840 lower_ia->ia_valid |= ATTR_SIZE;
840 goto out_free; 841 goto out_free;
841 } 842 }
842 if (num_zeros) { 843 if (num_zeros) {
@@ -848,7 +849,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
848 goto out_free; 849 goto out_free;
849 } 850 }
850 rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt, 851 rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt,
851 new_length, num_zeros); 852 ia->ia_size, num_zeros);
852 kfree(zeros_virt); 853 kfree(zeros_virt);
853 if (rc) { 854 if (rc) {
854 printk(KERN_ERR "Error attempting to zero out " 855 printk(KERN_ERR "Error attempting to zero out "
@@ -857,7 +858,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
857 goto out_free; 858 goto out_free;
858 } 859 }
859 } 860 }
860 vmtruncate(inode, new_length); 861 vmtruncate(inode, ia->ia_size);
861 rc = ecryptfs_write_inode_size_to_metadata(inode); 862 rc = ecryptfs_write_inode_size_to_metadata(inode);
862 if (rc) { 863 if (rc) {
863 printk(KERN_ERR "Problem with " 864 printk(KERN_ERR "Problem with "
@@ -870,10 +871,12 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
870 lower_size_before_truncate = 871 lower_size_before_truncate =
871 upper_size_to_lower_size(crypt_stat, i_size); 872 upper_size_to_lower_size(crypt_stat, i_size);
872 lower_size_after_truncate = 873 lower_size_after_truncate =
873 upper_size_to_lower_size(crypt_stat, new_length); 874 upper_size_to_lower_size(crypt_stat, ia->ia_size);
874 if (lower_size_after_truncate < lower_size_before_truncate) 875 if (lower_size_after_truncate < lower_size_before_truncate) {
875 vmtruncate(lower_dentry->d_inode, 876 lower_ia->ia_size = lower_size_after_truncate;
876 lower_size_after_truncate); 877 lower_ia->ia_valid |= ATTR_SIZE;
878 } else
879 lower_ia->ia_valid &= ~ATTR_SIZE;
877 } 880 }
878out_free: 881out_free:
879 if (ecryptfs_file_to_private(&fake_ecryptfs_file)) 882 if (ecryptfs_file_to_private(&fake_ecryptfs_file))
@@ -883,6 +886,33 @@ out:
883 return rc; 886 return rc;
884} 887}
885 888
889/**
890 * ecryptfs_truncate
891 * @dentry: The ecryptfs layer dentry
892 * @new_length: The length to expand the file to
893 *
894 * Simple function that handles the truncation of an eCryptfs inode and
895 * its corresponding lower inode.
896 *
897 * Returns zero on success; non-zero otherwise
898 */
899int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
900{
901 struct iattr ia = { .ia_valid = ATTR_SIZE, .ia_size = new_length };
902 struct iattr lower_ia = { .ia_valid = 0 };
903 int rc;
904
905 rc = truncate_upper(dentry, &ia, &lower_ia);
906 if (!rc && lower_ia.ia_valid & ATTR_SIZE) {
907 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
908
909 mutex_lock(&lower_dentry->d_inode->i_mutex);
910 rc = notify_change(lower_dentry, &lower_ia);
911 mutex_unlock(&lower_dentry->d_inode->i_mutex);
912 }
913 return rc;
914}
915
886static int 916static int
887ecryptfs_permission(struct inode *inode, int mask) 917ecryptfs_permission(struct inode *inode, int mask)
888{ 918{
@@ -905,6 +935,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
905{ 935{
906 int rc = 0; 936 int rc = 0;
907 struct dentry *lower_dentry; 937 struct dentry *lower_dentry;
938 struct iattr lower_ia;
908 struct inode *inode; 939 struct inode *inode;
909 struct inode *lower_inode; 940 struct inode *lower_inode;
910 struct ecryptfs_crypt_stat *crypt_stat; 941 struct ecryptfs_crypt_stat *crypt_stat;
@@ -943,15 +974,11 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
943 } 974 }
944 } 975 }
945 mutex_unlock(&crypt_stat->cs_mutex); 976 mutex_unlock(&crypt_stat->cs_mutex);
977 memcpy(&lower_ia, ia, sizeof(lower_ia));
978 if (ia->ia_valid & ATTR_FILE)
979 lower_ia.ia_file = ecryptfs_file_to_lower(ia->ia_file);
946 if (ia->ia_valid & ATTR_SIZE) { 980 if (ia->ia_valid & ATTR_SIZE) {
947 ecryptfs_printk(KERN_DEBUG, 981 rc = truncate_upper(dentry, ia, &lower_ia);
948 "ia->ia_valid = [0x%x] ATTR_SIZE" " = [0x%x]\n",
949 ia->ia_valid, ATTR_SIZE);
950 rc = ecryptfs_truncate(dentry, ia->ia_size);
951 /* ecryptfs_truncate handles resizing of the lower file */
952 ia->ia_valid &= ~ATTR_SIZE;
953 ecryptfs_printk(KERN_DEBUG, "ia->ia_valid = [%x]\n",
954 ia->ia_valid);
955 if (rc < 0) 982 if (rc < 0)
956 goto out; 983 goto out;
957 } 984 }
@@ -960,17 +987,54 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
960 * mode change is for clearing setuid/setgid bits. Allow lower fs 987 * mode change is for clearing setuid/setgid bits. Allow lower fs
961 * to interpret this in its own way. 988 * to interpret this in its own way.
962 */ 989 */
963 if (ia->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) 990 if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
964 ia->ia_valid &= ~ATTR_MODE; 991 lower_ia.ia_valid &= ~ATTR_MODE;
965 992
966 mutex_lock(&lower_dentry->d_inode->i_mutex); 993 mutex_lock(&lower_dentry->d_inode->i_mutex);
967 rc = notify_change(lower_dentry, ia); 994 rc = notify_change(lower_dentry, &lower_ia);
968 mutex_unlock(&lower_dentry->d_inode->i_mutex); 995 mutex_unlock(&lower_dentry->d_inode->i_mutex);
969out: 996out:
970 fsstack_copy_attr_all(inode, lower_inode); 997 fsstack_copy_attr_all(inode, lower_inode);
971 return rc; 998 return rc;
972} 999}
973 1000
1001int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,
1002 struct kstat *stat)
1003{
1004 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
1005 int rc = 0;
1006
1007 mount_crypt_stat = &ecryptfs_superblock_to_private(
1008 dentry->d_sb)->mount_crypt_stat;
1009 generic_fillattr(dentry->d_inode, stat);
1010 if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
1011 char *target;
1012 size_t targetsiz;
1013
1014 rc = ecryptfs_readlink_lower(dentry, &target, &targetsiz);
1015 if (!rc) {
1016 kfree(target);
1017 stat->size = targetsiz;
1018 }
1019 }
1020 return rc;
1021}
1022
1023int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1024 struct kstat *stat)
1025{
1026 struct kstat lower_stat;
1027 int rc;
1028
1029 rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry),
1030 ecryptfs_dentry_to_lower(dentry), &lower_stat);
1031 if (!rc) {
1032 generic_fillattr(dentry->d_inode, stat);
1033 stat->blocks = lower_stat.blocks;
1034 }
1035 return rc;
1036}
1037
974int 1038int
975ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, 1039ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
976 size_t size, int flags) 1040 size_t size, int flags)
@@ -980,7 +1044,7 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
980 1044
981 lower_dentry = ecryptfs_dentry_to_lower(dentry); 1045 lower_dentry = ecryptfs_dentry_to_lower(dentry);
982 if (!lower_dentry->d_inode->i_op->setxattr) { 1046 if (!lower_dentry->d_inode->i_op->setxattr) {
983 rc = -ENOSYS; 1047 rc = -EOPNOTSUPP;
984 goto out; 1048 goto out;
985 } 1049 }
986 mutex_lock(&lower_dentry->d_inode->i_mutex); 1050 mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -998,7 +1062,7 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
998 int rc = 0; 1062 int rc = 0;
999 1063
1000 if (!lower_dentry->d_inode->i_op->getxattr) { 1064 if (!lower_dentry->d_inode->i_op->getxattr) {
1001 rc = -ENOSYS; 1065 rc = -EOPNOTSUPP;
1002 goto out; 1066 goto out;
1003 } 1067 }
1004 mutex_lock(&lower_dentry->d_inode->i_mutex); 1068 mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1025,7 +1089,7 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size)
1025 1089
1026 lower_dentry = ecryptfs_dentry_to_lower(dentry); 1090 lower_dentry = ecryptfs_dentry_to_lower(dentry);
1027 if (!lower_dentry->d_inode->i_op->listxattr) { 1091 if (!lower_dentry->d_inode->i_op->listxattr) {
1028 rc = -ENOSYS; 1092 rc = -EOPNOTSUPP;
1029 goto out; 1093 goto out;
1030 } 1094 }
1031 mutex_lock(&lower_dentry->d_inode->i_mutex); 1095 mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1042,7 +1106,7 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name)
1042 1106
1043 lower_dentry = ecryptfs_dentry_to_lower(dentry); 1107 lower_dentry = ecryptfs_dentry_to_lower(dentry);
1044 if (!lower_dentry->d_inode->i_op->removexattr) { 1108 if (!lower_dentry->d_inode->i_op->removexattr) {
1045 rc = -ENOSYS; 1109 rc = -EOPNOTSUPP;
1046 goto out; 1110 goto out;
1047 } 1111 }
1048 mutex_lock(&lower_dentry->d_inode->i_mutex); 1112 mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1073,6 +1137,7 @@ const struct inode_operations ecryptfs_symlink_iops = {
1073 .put_link = ecryptfs_put_link, 1137 .put_link = ecryptfs_put_link,
1074 .permission = ecryptfs_permission, 1138 .permission = ecryptfs_permission,
1075 .setattr = ecryptfs_setattr, 1139 .setattr = ecryptfs_setattr,
1140 .getattr = ecryptfs_getattr_link,
1076 .setxattr = ecryptfs_setxattr, 1141 .setxattr = ecryptfs_setxattr,
1077 .getxattr = ecryptfs_getxattr, 1142 .getxattr = ecryptfs_getxattr,
1078 .listxattr = ecryptfs_listxattr, 1143 .listxattr = ecryptfs_listxattr,
@@ -1100,6 +1165,7 @@ const struct inode_operations ecryptfs_dir_iops = {
1100const struct inode_operations ecryptfs_main_iops = { 1165const struct inode_operations ecryptfs_main_iops = {
1101 .permission = ecryptfs_permission, 1166 .permission = ecryptfs_permission,
1102 .setattr = ecryptfs_setattr, 1167 .setattr = ecryptfs_setattr,
1168 .getattr = ecryptfs_getattr,
1103 .setxattr = ecryptfs_setxattr, 1169 .setxattr = ecryptfs_setxattr,
1104 .getxattr = ecryptfs_getxattr, 1170 .getxattr = ecryptfs_getxattr,
1105 .listxattr = ecryptfs_listxattr, 1171 .listxattr = ecryptfs_listxattr,
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index a0a7847567e9..89c5476506ef 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -32,6 +32,7 @@
32#include <linux/random.h> 32#include <linux/random.h>
33#include <linux/crypto.h> 33#include <linux/crypto.h>
34#include <linux/scatterlist.h> 34#include <linux/scatterlist.h>
35#include <linux/slab.h>
35#include "ecryptfs_kernel.h" 36#include "ecryptfs_kernel.h"
36 37
37/** 38/**
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index e14cf7e588db..d8c3a373aafa 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/slab.h>
25#include <linux/wait.h> 26#include <linux/wait.h>
26#include <linux/mount.h> 27#include <linux/mount.h>
27#include "ecryptfs_kernel.h" 28#include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 567bc4b9f70a..760983d0f25e 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -35,6 +35,7 @@
35#include <linux/key.h> 35#include <linux/key.h>
36#include <linux/parser.h> 36#include <linux/parser.h>
37#include <linux/fs_stack.h> 37#include <linux/fs_stack.h>
38#include <linux/slab.h>
38#include "ecryptfs_kernel.h" 39#include "ecryptfs_kernel.h"
39 40
40/** 41/**
@@ -496,17 +497,25 @@ struct kmem_cache *ecryptfs_sb_info_cache;
496static int 497static int
497ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent) 498ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent)
498{ 499{
500 struct ecryptfs_sb_info *esi;
499 int rc = 0; 501 int rc = 0;
500 502
501 /* Released in ecryptfs_put_super() */ 503 /* Released in ecryptfs_put_super() */
502 ecryptfs_set_superblock_private(sb, 504 ecryptfs_set_superblock_private(sb,
503 kmem_cache_zalloc(ecryptfs_sb_info_cache, 505 kmem_cache_zalloc(ecryptfs_sb_info_cache,
504 GFP_KERNEL)); 506 GFP_KERNEL));
505 if (!ecryptfs_superblock_to_private(sb)) { 507 esi = ecryptfs_superblock_to_private(sb);
508 if (!esi) {
506 ecryptfs_printk(KERN_WARNING, "Out of memory\n"); 509 ecryptfs_printk(KERN_WARNING, "Out of memory\n");
507 rc = -ENOMEM; 510 rc = -ENOMEM;
508 goto out; 511 goto out;
509 } 512 }
513
514 rc = bdi_setup_and_register(&esi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
515 if (rc)
516 goto out;
517
518 sb->s_bdi = &esi->bdi;
510 sb->s_op = &ecryptfs_sops; 519 sb->s_op = &ecryptfs_sops;
511 /* Released through deactivate_super(sb) from get_sb_nodev */ 520 /* Released through deactivate_super(sb) from get_sb_nodev */
512 sb->s_root = d_alloc(NULL, &(const struct qstr) { 521 sb->s_root = d_alloc(NULL, &(const struct qstr) {
@@ -585,8 +594,8 @@ out:
585 * with as much information as it can before needing 594 * with as much information as it can before needing
586 * the lower filesystem. 595 * the lower filesystem.
587 * ecryptfs_read_super(): this accesses the lower filesystem and uses 596 * ecryptfs_read_super(): this accesses the lower filesystem and uses
588 * ecryptfs_interpolate to perform most of the linking 597 * ecryptfs_interpose to perform most of the linking
589 * ecryptfs_interpolate(): links the lower filesystem into ecryptfs 598 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
590 */ 599 */
591static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags, 600static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
592 const char *dev_name, void *raw_data, 601 const char *dev_name, void *raw_data,
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index f1c17e87c5fb..2d8dbce9d485 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -20,6 +20,7 @@
20 * 02111-1307, USA. 20 * 02111-1307, USA.
21 */ 21 */
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/slab.h>
23#include <linux/user_namespace.h> 24#include <linux/user_namespace.h>
24#include <linux/nsproxy.h> 25#include <linux/nsproxy.h>
25#include "ecryptfs_kernel.h" 26#include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 4ec8f61ccf5a..3745f612bcd4 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -24,6 +24,7 @@
24#include <linux/random.h> 24#include <linux/random.h>
25#include <linux/miscdevice.h> 25#include <linux/miscdevice.h>
26#include <linux/poll.h> 26#include <linux/poll.h>
27#include <linux/slab.h>
27#include <linux/wait.h> 28#include <linux/wait.h>
28#include <linux/module.h> 29#include <linux/module.h>
29#include "ecryptfs_kernel.h" 30#include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index df4ce99d0597..2ee9a3a7b68c 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -32,6 +32,7 @@
32#include <linux/file.h> 32#include <linux/file.h>
33#include <linux/crypto.h> 33#include <linux/crypto.h>
34#include <linux/scatterlist.h> 34#include <linux/scatterlist.h>
35#include <linux/slab.h>
35#include <asm/unaligned.h> 36#include <asm/unaligned.h>
36#include "ecryptfs_kernel.h" 37#include "ecryptfs_kernel.h"
37 38
@@ -82,6 +83,19 @@ out:
82 return rc; 83 return rc;
83} 84}
84 85
86static void strip_xattr_flag(char *page_virt,
87 struct ecryptfs_crypt_stat *crypt_stat)
88{
89 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
90 size_t written;
91
92 crypt_stat->flags &= ~ECRYPTFS_METADATA_IN_XATTR;
93 ecryptfs_write_crypt_stat_flags(page_virt, crypt_stat,
94 &written);
95 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
96 }
97}
98
85/** 99/**
86 * Header Extent: 100 * Header Extent:
87 * Octets 0-7: Unencrypted file size (big-endian) 101 * Octets 0-7: Unencrypted file size (big-endian)
@@ -97,19 +111,6 @@ out:
97 * (big-endian) 111 * (big-endian)
98 * Octet 26: Begin RFC 2440 authentication token packet set 112 * Octet 26: Begin RFC 2440 authentication token packet set
99 */ 113 */
100static void set_header_info(char *page_virt,
101 struct ecryptfs_crypt_stat *crypt_stat)
102{
103 size_t written;
104 size_t save_num_header_bytes_at_front =
105 crypt_stat->num_header_bytes_at_front;
106
107 crypt_stat->num_header_bytes_at_front =
108 ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
109 ecryptfs_write_header_metadata(page_virt + 20, crypt_stat, &written);
110 crypt_stat->num_header_bytes_at_front =
111 save_num_header_bytes_at_front;
112}
113 114
114/** 115/**
115 * ecryptfs_copy_up_encrypted_with_header 116 * ecryptfs_copy_up_encrypted_with_header
@@ -135,8 +136,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
135 * num_extents_per_page) 136 * num_extents_per_page)
136 + extent_num_in_page); 137 + extent_num_in_page);
137 size_t num_header_extents_at_front = 138 size_t num_header_extents_at_front =
138 (crypt_stat->num_header_bytes_at_front 139 (crypt_stat->metadata_size / crypt_stat->extent_size);
139 / crypt_stat->extent_size);
140 140
141 if (view_extent_num < num_header_extents_at_front) { 141 if (view_extent_num < num_header_extents_at_front) {
142 /* This is a header extent */ 142 /* This is a header extent */
@@ -146,9 +146,14 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
146 memset(page_virt, 0, PAGE_CACHE_SIZE); 146 memset(page_virt, 0, PAGE_CACHE_SIZE);
147 /* TODO: Support more than one header extent */ 147 /* TODO: Support more than one header extent */
148 if (view_extent_num == 0) { 148 if (view_extent_num == 0) {
149 size_t written;
150
149 rc = ecryptfs_read_xattr_region( 151 rc = ecryptfs_read_xattr_region(
150 page_virt, page->mapping->host); 152 page_virt, page->mapping->host);
151 set_header_info(page_virt, crypt_stat); 153 strip_xattr_flag(page_virt + 16, crypt_stat);
154 ecryptfs_write_header_metadata(page_virt + 20,
155 crypt_stat,
156 &written);
152 } 157 }
153 kunmap_atomic(page_virt, KM_USER0); 158 kunmap_atomic(page_virt, KM_USER0);
154 flush_dcache_page(page); 159 flush_dcache_page(page);
@@ -161,7 +166,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
161 /* This is an encrypted data extent */ 166 /* This is an encrypted data extent */
162 loff_t lower_offset = 167 loff_t lower_offset =
163 ((view_extent_num * crypt_stat->extent_size) 168 ((view_extent_num * crypt_stat->extent_size)
164 - crypt_stat->num_header_bytes_at_front); 169 - crypt_stat->metadata_size);
165 170
166 rc = ecryptfs_read_lower_page_segment( 171 rc = ecryptfs_read_lower_page_segment(
167 page, (lower_offset >> PAGE_CACHE_SHIFT), 172 page, (lower_offset >> PAGE_CACHE_SHIFT),
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index b15a43a80ab7..0c0ae491d231 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -26,6 +26,7 @@
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/key.h> 28#include <linux/key.h>
29#include <linux/slab.h>
29#include <linux/seq_file.h> 30#include <linux/seq_file.h>
30#include <linux/smp_lock.h> 31#include <linux/smp_lock.h>
31#include <linux/file.h> 32#include <linux/file.h>
@@ -85,7 +86,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
85 if (lower_dentry->d_inode) { 86 if (lower_dentry->d_inode) {
86 fput(inode_info->lower_file); 87 fput(inode_info->lower_file);
87 inode_info->lower_file = NULL; 88 inode_info->lower_file = NULL;
88 d_drop(lower_dentry);
89 } 89 }
90 } 90 }
91 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat); 91 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
@@ -122,6 +122,7 @@ static void ecryptfs_put_super(struct super_block *sb)
122 lock_kernel(); 122 lock_kernel();
123 123
124 ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); 124 ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
125 bdi_destroy(&sb_info->bdi);
125 kmem_cache_free(ecryptfs_sb_info_cache, sb_info); 126 kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
126 ecryptfs_set_superblock_private(sb, NULL); 127 ecryptfs_set_superblock_private(sb, NULL);
127 128
diff --git a/fs/eventfd.c b/fs/eventfd.c
index d26402ff06ea..6bd3f76fdf88 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -11,6 +11,7 @@
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/slab.h>
14#include <linux/list.h> 15#include <linux/list.h>
15#include <linux/spinlock.h> 16#include <linux/spinlock.h>
16#include <linux/anon_inodes.h> 17#include <linux/anon_inodes.h>
@@ -135,26 +136,71 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait)
135 return events; 136 return events;
136} 137}
137 138
138static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, 139static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
139 loff_t *ppos) 140{
141 *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
142 ctx->count -= *cnt;
143}
144
145/**
146 * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
147 * @ctx: [in] Pointer to eventfd context.
148 * @wait: [in] Wait queue to be removed.
149 * @cnt: [out] Pointer to the 64bit conter value.
150 *
151 * Returns zero if successful, or the following error codes:
152 *
153 * -EAGAIN : The operation would have blocked.
154 *
155 * This is used to atomically remove a wait queue entry from the eventfd wait
156 * queue head, and read/reset the counter value.
157 */
158int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
159 __u64 *cnt)
160{
161 unsigned long flags;
162
163 spin_lock_irqsave(&ctx->wqh.lock, flags);
164 eventfd_ctx_do_read(ctx, cnt);
165 __remove_wait_queue(&ctx->wqh, wait);
166 if (*cnt != 0 && waitqueue_active(&ctx->wqh))
167 wake_up_locked_poll(&ctx->wqh, POLLOUT);
168 spin_unlock_irqrestore(&ctx->wqh.lock, flags);
169
170 return *cnt != 0 ? 0 : -EAGAIN;
171}
172EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
173
174/**
175 * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero.
176 * @ctx: [in] Pointer to eventfd context.
177 * @no_wait: [in] Different from zero if the operation should not block.
178 * @cnt: [out] Pointer to the 64bit conter value.
179 *
180 * Returns zero if successful, or the following error codes:
181 *
182 * -EAGAIN : The operation would have blocked but @no_wait was nonzero.
183 * -ERESTARTSYS : A signal interrupted the wait operation.
184 *
185 * If @no_wait is zero, the function might sleep until the eventfd internal
186 * counter becomes greater than zero.
187 */
188ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt)
140{ 189{
141 struct eventfd_ctx *ctx = file->private_data;
142 ssize_t res; 190 ssize_t res;
143 __u64 ucnt = 0;
144 DECLARE_WAITQUEUE(wait, current); 191 DECLARE_WAITQUEUE(wait, current);
145 192
146 if (count < sizeof(ucnt))
147 return -EINVAL;
148 spin_lock_irq(&ctx->wqh.lock); 193 spin_lock_irq(&ctx->wqh.lock);
194 *cnt = 0;
149 res = -EAGAIN; 195 res = -EAGAIN;
150 if (ctx->count > 0) 196 if (ctx->count > 0)
151 res = sizeof(ucnt); 197 res = 0;
152 else if (!(file->f_flags & O_NONBLOCK)) { 198 else if (!no_wait) {
153 __add_wait_queue(&ctx->wqh, &wait); 199 __add_wait_queue(&ctx->wqh, &wait);
154 for (res = 0;;) { 200 for (;;) {
155 set_current_state(TASK_INTERRUPTIBLE); 201 set_current_state(TASK_INTERRUPTIBLE);
156 if (ctx->count > 0) { 202 if (ctx->count > 0) {
157 res = sizeof(ucnt); 203 res = 0;
158 break; 204 break;
159 } 205 }
160 if (signal_pending(current)) { 206 if (signal_pending(current)) {
@@ -168,18 +214,32 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
168 __remove_wait_queue(&ctx->wqh, &wait); 214 __remove_wait_queue(&ctx->wqh, &wait);
169 __set_current_state(TASK_RUNNING); 215 __set_current_state(TASK_RUNNING);
170 } 216 }
171 if (likely(res > 0)) { 217 if (likely(res == 0)) {
172 ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; 218 eventfd_ctx_do_read(ctx, cnt);
173 ctx->count -= ucnt;
174 if (waitqueue_active(&ctx->wqh)) 219 if (waitqueue_active(&ctx->wqh))
175 wake_up_locked_poll(&ctx->wqh, POLLOUT); 220 wake_up_locked_poll(&ctx->wqh, POLLOUT);
176 } 221 }
177 spin_unlock_irq(&ctx->wqh.lock); 222 spin_unlock_irq(&ctx->wqh.lock);
178 if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
179 return -EFAULT;
180 223
181 return res; 224 return res;
182} 225}
226EXPORT_SYMBOL_GPL(eventfd_ctx_read);
227
228static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
229 loff_t *ppos)
230{
231 struct eventfd_ctx *ctx = file->private_data;
232 ssize_t res;
233 __u64 cnt;
234
235 if (count < sizeof(cnt))
236 return -EINVAL;
237 res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt);
238 if (res < 0)
239 return res;
240
241 return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt);
242}
183 243
184static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count, 244static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
185 loff_t *ppos) 245 loff_t *ppos)
diff --git a/fs/exec.c b/fs/exec.c
index 632b02e34ec7..49cdaa19e5b9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -195,7 +195,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
195 * to work from. 195 * to work from.
196 */ 196 */
197 rlim = current->signal->rlim; 197 rlim = current->signal->rlim;
198 if (size > rlim[RLIMIT_STACK].rlim_cur / 4) { 198 if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) {
199 put_page(page); 199 put_page(page);
200 return NULL; 200 return NULL;
201 } 201 }
@@ -246,6 +246,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
246 vma->vm_start = vma->vm_end - PAGE_SIZE; 246 vma->vm_start = vma->vm_end - PAGE_SIZE;
247 vma->vm_flags = VM_STACK_FLAGS; 247 vma->vm_flags = VM_STACK_FLAGS;
248 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 248 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
249 INIT_LIST_HEAD(&vma->anon_vma_chain);
249 err = insert_vm_struct(mm, vma); 250 err = insert_vm_struct(mm, vma);
250 if (err) 251 if (err)
251 goto err; 252 goto err;
@@ -516,7 +517,8 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
516 /* 517 /*
517 * cover the whole range: [new_start, old_end) 518 * cover the whole range: [new_start, old_end)
518 */ 519 */
519 vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL); 520 if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
521 return -ENOMEM;
520 522
521 /* 523 /*
522 * move the page tables downwards, on failure we rely on 524 * move the page tables downwards, on failure we rely on
@@ -547,15 +549,13 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
547 tlb_finish_mmu(tlb, new_end, old_end); 549 tlb_finish_mmu(tlb, new_end, old_end);
548 550
549 /* 551 /*
550 * shrink the vma to just the new range. 552 * Shrink the vma to just the new range. Always succeeds.
551 */ 553 */
552 vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL); 554 vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
553 555
554 return 0; 556 return 0;
555} 557}
556 558
557#define EXTRA_STACK_VM_PAGES 20 /* random */
558
559/* 559/*
560 * Finalizes the stack vm_area_struct. The flags and permissions are updated, 560 * Finalizes the stack vm_area_struct. The flags and permissions are updated,
561 * the stack is optionally relocated, and some extra space is added. 561 * the stack is optionally relocated, and some extra space is added.
@@ -571,10 +571,13 @@ int setup_arg_pages(struct linux_binprm *bprm,
571 struct vm_area_struct *prev = NULL; 571 struct vm_area_struct *prev = NULL;
572 unsigned long vm_flags; 572 unsigned long vm_flags;
573 unsigned long stack_base; 573 unsigned long stack_base;
574 unsigned long stack_size;
575 unsigned long stack_expand;
576 unsigned long rlim_stack;
574 577
575#ifdef CONFIG_STACK_GROWSUP 578#ifdef CONFIG_STACK_GROWSUP
576 /* Limit stack size to 1GB */ 579 /* Limit stack size to 1GB */
577 stack_base = current->signal->rlim[RLIMIT_STACK].rlim_max; 580 stack_base = rlimit_max(RLIMIT_STACK);
578 if (stack_base > (1 << 30)) 581 if (stack_base > (1 << 30))
579 stack_base = 1 << 30; 582 stack_base = 1 << 30;
580 583
@@ -627,10 +630,23 @@ int setup_arg_pages(struct linux_binprm *bprm,
627 goto out_unlock; 630 goto out_unlock;
628 } 631 }
629 632
633 stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
634 stack_size = vma->vm_end - vma->vm_start;
635 /*
636 * Align this down to a page boundary as expand_stack
637 * will align it up.
638 */
639 rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK;
630#ifdef CONFIG_STACK_GROWSUP 640#ifdef CONFIG_STACK_GROWSUP
631 stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE; 641 if (stack_size + stack_expand > rlim_stack)
642 stack_base = vma->vm_start + rlim_stack;
643 else
644 stack_base = vma->vm_end + stack_expand;
632#else 645#else
633 stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE; 646 if (stack_size + stack_expand > rlim_stack)
647 stack_base = vma->vm_end - rlim_stack;
648 else
649 stack_base = vma->vm_start - stack_expand;
634#endif 650#endif
635 ret = expand_stack(vma, stack_base); 651 ret = expand_stack(vma, stack_base);
636 if (ret) 652 if (ret)
@@ -702,6 +718,7 @@ static int exec_mmap(struct mm_struct *mm)
702 /* Notify parent that we're no longer interested in the old VM */ 718 /* Notify parent that we're no longer interested in the old VM */
703 tsk = current; 719 tsk = current;
704 old_mm = current->mm; 720 old_mm = current->mm;
721 sync_mm_rss(tsk, old_mm);
705 mm_release(tsk, old_mm); 722 mm_release(tsk, old_mm);
706 723
707 if (old_mm) { 724 if (old_mm) {
@@ -941,9 +958,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
941 958
942int flush_old_exec(struct linux_binprm * bprm) 959int flush_old_exec(struct linux_binprm * bprm)
943{ 960{
944 char * name; 961 int retval;
945 int i, ch, retval;
946 char tcomm[sizeof(current->comm)];
947 962
948 /* 963 /*
949 * Make sure we have a private signal table and that 964 * Make sure we have a private signal table and that
@@ -964,6 +979,25 @@ int flush_old_exec(struct linux_binprm * bprm)
964 979
965 bprm->mm = NULL; /* We're using it now */ 980 bprm->mm = NULL; /* We're using it now */
966 981
982 current->flags &= ~PF_RANDOMIZE;
983 flush_thread();
984 current->personality &= ~bprm->per_clear;
985
986 return 0;
987
988out:
989 return retval;
990}
991EXPORT_SYMBOL(flush_old_exec);
992
993void setup_new_exec(struct linux_binprm * bprm)
994{
995 int i, ch;
996 char * name;
997 char tcomm[sizeof(current->comm)];
998
999 arch_pick_mmap_layout(current->mm);
1000
967 /* This is the point of no return */ 1001 /* This is the point of no return */
968 current->sas_ss_sp = current->sas_ss_size = 0; 1002 current->sas_ss_sp = current->sas_ss_size = 0;
969 1003
@@ -985,9 +1019,6 @@ int flush_old_exec(struct linux_binprm * bprm)
985 tcomm[i] = '\0'; 1019 tcomm[i] = '\0';
986 set_task_comm(current, tcomm); 1020 set_task_comm(current, tcomm);
987 1021
988 current->flags &= ~PF_RANDOMIZE;
989 flush_thread();
990
991 /* Set the new mm task size. We have to do that late because it may 1022 /* Set the new mm task size. We have to do that late because it may
992 * depend on TIF_32BIT which is only updated in flush_thread() on 1023 * depend on TIF_32BIT which is only updated in flush_thread() on
993 * some architectures like powerpc 1024 * some architectures like powerpc
@@ -1003,8 +1034,6 @@ int flush_old_exec(struct linux_binprm * bprm)
1003 set_dumpable(current->mm, suid_dumpable); 1034 set_dumpable(current->mm, suid_dumpable);
1004 } 1035 }
1005 1036
1006 current->personality &= ~bprm->per_clear;
1007
1008 /* 1037 /*
1009 * Flush performance counters when crossing a 1038 * Flush performance counters when crossing a
1010 * security domain: 1039 * security domain:
@@ -1019,14 +1048,8 @@ int flush_old_exec(struct linux_binprm * bprm)
1019 1048
1020 flush_signal_handlers(current, 0); 1049 flush_signal_handlers(current, 0);
1021 flush_old_files(current->files); 1050 flush_old_files(current->files);
1022
1023 return 0;
1024
1025out:
1026 return retval;
1027} 1051}
1028 1052EXPORT_SYMBOL(setup_new_exec);
1029EXPORT_SYMBOL(flush_old_exec);
1030 1053
1031/* 1054/*
1032 * Prepare credentials and lock ->cred_guard_mutex. 1055 * Prepare credentials and lock ->cred_guard_mutex.
@@ -1510,7 +1533,7 @@ static int format_corename(char *corename, long signr)
1510 /* core limit size */ 1533 /* core limit size */
1511 case 'c': 1534 case 'c':
1512 rc = snprintf(out_ptr, out_end - out_ptr, 1535 rc = snprintf(out_ptr, out_end - out_ptr,
1513 "%lu", current->signal->rlim[RLIMIT_CORE].rlim_cur); 1536 "%lu", rlimit(RLIMIT_CORE));
1514 if (rc > out_end - out_ptr) 1537 if (rc > out_end - out_ptr)
1515 goto out; 1538 goto out;
1516 out_ptr += rc; 1539 out_ptr += rc;
@@ -1538,12 +1561,13 @@ out:
1538 return ispipe; 1561 return ispipe;
1539} 1562}
1540 1563
1541static int zap_process(struct task_struct *start) 1564static int zap_process(struct task_struct *start, int exit_code)
1542{ 1565{
1543 struct task_struct *t; 1566 struct task_struct *t;
1544 int nr = 0; 1567 int nr = 0;
1545 1568
1546 start->signal->flags = SIGNAL_GROUP_EXIT; 1569 start->signal->flags = SIGNAL_GROUP_EXIT;
1570 start->signal->group_exit_code = exit_code;
1547 start->signal->group_stop_count = 0; 1571 start->signal->group_stop_count = 0;
1548 1572
1549 t = start; 1573 t = start;
@@ -1568,8 +1592,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
1568 spin_lock_irq(&tsk->sighand->siglock); 1592 spin_lock_irq(&tsk->sighand->siglock);
1569 if (!signal_group_exit(tsk->signal)) { 1593 if (!signal_group_exit(tsk->signal)) {
1570 mm->core_state = core_state; 1594 mm->core_state = core_state;
1571 tsk->signal->group_exit_code = exit_code; 1595 nr = zap_process(tsk, exit_code);
1572 nr = zap_process(tsk);
1573 } 1596 }
1574 spin_unlock_irq(&tsk->sighand->siglock); 1597 spin_unlock_irq(&tsk->sighand->siglock);
1575 if (unlikely(nr < 0)) 1598 if (unlikely(nr < 0))
@@ -1618,7 +1641,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
1618 if (p->mm) { 1641 if (p->mm) {
1619 if (unlikely(p->mm == mm)) { 1642 if (unlikely(p->mm == mm)) {
1620 lock_task_sighand(p, &flags); 1643 lock_task_sighand(p, &flags);
1621 nr += zap_process(p); 1644 nr += zap_process(p, exit_code);
1622 unlock_task_sighand(p, &flags); 1645 unlock_task_sighand(p, &flags);
1623 } 1646 }
1624 break; 1647 break;
@@ -1725,14 +1748,19 @@ void set_dumpable(struct mm_struct *mm, int value)
1725 } 1748 }
1726} 1749}
1727 1750
1728int get_dumpable(struct mm_struct *mm) 1751static int __get_dumpable(unsigned long mm_flags)
1729{ 1752{
1730 int ret; 1753 int ret;
1731 1754
1732 ret = mm->flags & 0x3; 1755 ret = mm_flags & MMF_DUMPABLE_MASK;
1733 return (ret >= 2) ? 2 : ret; 1756 return (ret >= 2) ? 2 : ret;
1734} 1757}
1735 1758
1759int get_dumpable(struct mm_struct *mm)
1760{
1761 return __get_dumpable(mm->flags);
1762}
1763
1736static void wait_for_dump_helpers(struct file *file) 1764static void wait_for_dump_helpers(struct file *file)
1737{ 1765{
1738 struct pipe_inode_info *pipe; 1766 struct pipe_inode_info *pipe;
@@ -1775,7 +1803,13 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1775 struct coredump_params cprm = { 1803 struct coredump_params cprm = {
1776 .signr = signr, 1804 .signr = signr,
1777 .regs = regs, 1805 .regs = regs,
1778 .limit = current->signal->rlim[RLIMIT_CORE].rlim_cur, 1806 .limit = rlimit(RLIMIT_CORE),
1807 /*
1808 * We must use the same mm->flags while dumping core to avoid
1809 * inconsistency of bit flags, since this flag is not protected
1810 * by any locks.
1811 */
1812 .mm_flags = mm->flags,
1779 }; 1813 };
1780 1814
1781 audit_core_dumps(signr); 1815 audit_core_dumps(signr);
@@ -1794,7 +1828,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1794 /* 1828 /*
1795 * If another thread got here first, or we are not dumpable, bail out. 1829 * If another thread got here first, or we are not dumpable, bail out.
1796 */ 1830 */
1797 if (mm->core_state || !get_dumpable(mm)) { 1831 if (mm->core_state || !__get_dumpable(cprm.mm_flags)) {
1798 up_write(&mm->mmap_sem); 1832 up_write(&mm->mmap_sem);
1799 put_cred(cred); 1833 put_cred(cred);
1800 goto fail; 1834 goto fail;
@@ -1805,7 +1839,8 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1805 * process nor do we know its entire history. We only know it 1839 * process nor do we know its entire history. We only know it
1806 * was tainted so we dump it as root in mode 2. 1840 * was tainted so we dump it as root in mode 2.
1807 */ 1841 */
1808 if (get_dumpable(mm) == 2) { /* Setuid core dump mode */ 1842 if (__get_dumpable(cprm.mm_flags) == 2) {
1843 /* Setuid core dump mode */
1809 flag = O_EXCL; /* Stop rewrite attacks */ 1844 flag = O_EXCL; /* Stop rewrite attacks */
1810 cred->fsuid = 0; /* Dump root private */ 1845 cred->fsuid = 0; /* Dump root private */
1811 } 1846 }
@@ -1901,8 +1936,9 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1901 /* 1936 /*
1902 * Dont allow local users get cute and trick others to coredump 1937 * Dont allow local users get cute and trick others to coredump
1903 * into their pre-created files: 1938 * into their pre-created files:
1939 * Note, this is not relevant for pipes
1904 */ 1940 */
1905 if (inode->i_uid != current_fsuid()) 1941 if (!ispipe && (inode->i_uid != current_fsuid()))
1906 goto close_fail; 1942 goto close_fail;
1907 if (!cprm.file->f_op) 1943 if (!cprm.file->f_op)
1908 goto close_fail; 1944 goto close_fail;
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index b1b178e61718..f0d520312d8b 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -55,6 +55,8 @@
55/* exofs Application specific page/attribute */ 55/* exofs Application specific page/attribute */
56# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) 56# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
57# define EXOFS_ATTR_INODE_DATA 1 57# define EXOFS_ATTR_INODE_DATA 1
58# define EXOFS_ATTR_INODE_FILE_LAYOUT 2
59# define EXOFS_ATTR_INODE_DIR_LAYOUT 3
58 60
59/* 61/*
60 * The maximum number of files we can have is limited by the size of the 62 * The maximum number of files we can have is limited by the size of the
@@ -206,4 +208,41 @@ enum {
206 (((name_len) + offsetof(struct exofs_dir_entry, name) + \ 208 (((name_len) + offsetof(struct exofs_dir_entry, name) + \
207 EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND) 209 EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
208 210
211/*
212 * The on-disk (optional) layout structure.
213 * sits in an EXOFS_ATTR_INODE_FILE_LAYOUT or EXOFS_ATTR_INODE_DIR_LAYOUT
214 * attribute, attached to any inode, usually to a directory.
215 */
216
217enum exofs_inode_layout_gen_functions {
218 LAYOUT_MOVING_WINDOW = 0,
219 LAYOUT_IMPLICT = 1,
220};
221
222struct exofs_on_disk_inode_layout {
223 __le16 gen_func; /* One of enum exofs_inode_layout_gen_functions */
224 __le16 pad;
225 union {
226 /* gen_func == LAYOUT_MOVING_WINDOW (default) */
227 struct exofs_layout_sliding_window {
228 __le32 num_devices; /* first n devices in global-table*/
229 } sliding_window __packed;
230
231 /* gen_func == LAYOUT_IMPLICT */
232 struct exofs_layout_implict_list {
233 struct exofs_dt_data_map data_map;
234 /* Variable array of size data_map.cb_num_comps. These
235 * are device indexes of the devices in the global table
236 */
237 __le32 dev_indexes[];
238 } implict __packed;
239 };
240} __packed;
241
242static inline size_t exofs_on_disk_inode_layout_size(unsigned max_devs)
243{
244 return sizeof(struct exofs_on_disk_inode_layout) +
245 max_devs * sizeof(__le32);
246}
247
209#endif /*ifndef __EXOFS_COM_H__*/ 248#endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index c35fd4623986..54373278a353 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -35,6 +35,7 @@
35 35
36#include <linux/fs.h> 36#include <linux/fs.h>
37#include <linux/time.h> 37#include <linux/time.h>
38#include <linux/backing-dev.h>
38#include "common.h" 39#include "common.h"
39 40
40/* FIXME: Remove once pnfs hits mainline 41/* FIXME: Remove once pnfs hits mainline
@@ -55,12 +56,28 @@
55/* u64 has problems with printk this will cast it to unsigned long long */ 56/* u64 has problems with printk this will cast it to unsigned long long */
56#define _LLU(x) (unsigned long long)(x) 57#define _LLU(x) (unsigned long long)(x)
57 58
59struct exofs_layout {
60 osd_id s_pid; /* partition ID of file system*/
61
62 /* Our way of looking at the data_map */
63 unsigned stripe_unit;
64 unsigned mirrors_p1;
65
66 unsigned group_width;
67 u64 group_depth;
68 unsigned group_count;
69
70 enum exofs_inode_layout_gen_functions lay_func;
71
72 unsigned s_numdevs; /* Num of devices in array */
73 struct osd_dev *s_ods[0]; /* Variable length */
74};
75
58/* 76/*
59 * our extension to the in-memory superblock 77 * our extension to the in-memory superblock
60 */ 78 */
61struct exofs_sb_info { 79struct exofs_sb_info {
62 struct exofs_fscb s_fscb; /* Written often, pre-allocate*/ 80 struct exofs_fscb s_fscb; /* Written often, pre-allocate*/
63 osd_id s_pid; /* partition ID of file system*/
64 int s_timeout; /* timeout for OSD operations */ 81 int s_timeout; /* timeout for OSD operations */
65 uint64_t s_nextid; /* highest object ID used */ 82 uint64_t s_nextid; /* highest object ID used */
66 uint32_t s_numfiles; /* number of files on fs */ 83 uint32_t s_numfiles; /* number of files on fs */
@@ -69,22 +86,28 @@ struct exofs_sb_info {
69 atomic_t s_curr_pending; /* number of pending commands */ 86 atomic_t s_curr_pending; /* number of pending commands */
70 uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */ 87 uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */
71 88
72 struct pnfs_osd_data_map data_map; /* Default raid to use */ 89 struct pnfs_osd_data_map data_map; /* Default raid to use
73 unsigned s_numdevs; /* Num of devices in array */ 90 * FIXME: Needed ?
74 struct osd_dev *s_ods[1]; /* Variable length, minimum 1 */ 91 */
92/* struct exofs_layout dir_layout;*/ /* Default dir layout */
93 struct exofs_layout layout; /* Default files layout,
94 * contains the variable osd_dev
95 * array. Keep last */
96 struct backing_dev_info bdi;
97 struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */
75}; 98};
76 99
77/* 100/*
78 * our extension to the in-memory inode 101 * our extension to the in-memory inode
79 */ 102 */
80struct exofs_i_info { 103struct exofs_i_info {
104 struct inode vfs_inode; /* normal in-memory inode */
105 wait_queue_head_t i_wq; /* wait queue for inode */
81 unsigned long i_flags; /* various atomic flags */ 106 unsigned long i_flags; /* various atomic flags */
82 uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ 107 uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/
83 uint32_t i_dir_start_lookup; /* which page to start lookup */ 108 uint32_t i_dir_start_lookup; /* which page to start lookup */
84 wait_queue_head_t i_wq; /* wait queue for inode */
85 uint64_t i_commit_size; /* the object's written length */ 109 uint64_t i_commit_size; /* the object's written length */
86 uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */ 110 uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */
87 struct inode vfs_inode; /* normal in-memory inode */
88}; 111};
89 112
90static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) 113static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
@@ -101,7 +124,7 @@ struct exofs_io_state {
101 void *private; 124 void *private;
102 exofs_io_done_fn done; 125 exofs_io_done_fn done;
103 126
104 struct exofs_sb_info *sbi; 127 struct exofs_layout *layout;
105 struct osd_obj_id obj; 128 struct osd_obj_id obj;
106 u8 *cred; 129 u8 *cred;
107 130
@@ -109,7 +132,11 @@ struct exofs_io_state {
109 loff_t offset; 132 loff_t offset;
110 unsigned long length; 133 unsigned long length;
111 void *kern_buff; 134 void *kern_buff;
112 struct bio *bio; 135
136 struct page **pages;
137 unsigned nr_pages;
138 unsigned pgbase;
139 unsigned pages_consumed;
113 140
114 /* Attributes */ 141 /* Attributes */
115 unsigned in_attr_len; 142 unsigned in_attr_len;
@@ -122,6 +149,9 @@ struct exofs_io_state {
122 struct exofs_per_dev_state { 149 struct exofs_per_dev_state {
123 struct osd_request *or; 150 struct osd_request *or;
124 struct bio *bio; 151 struct bio *bio;
152 loff_t offset;
153 unsigned length;
154 unsigned dev;
125 } per_dev[]; 155 } per_dev[];
126}; 156};
127 157
@@ -175,6 +205,12 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
175} 205}
176 206
177/* 207/*
208 * Given a layout, object_number and stripe_index return the associated global
209 * dev_index
210 */
211unsigned exofs_layout_od_id(struct exofs_layout *layout,
212 osd_id obj_no, unsigned layout_index);
213/*
178 * Maximum count of links to a file 214 * Maximum count of links to a file
179 */ 215 */
180#define EXOFS_LINK_MAX 32000 216#define EXOFS_LINK_MAX 32000
@@ -189,7 +225,8 @@ void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
189int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, 225int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
190 u64 offset, void *p, unsigned length); 226 u64 offset, void *p, unsigned length);
191 227
192int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** ios); 228int exofs_get_io_state(struct exofs_layout *layout,
229 struct exofs_io_state **ios);
193void exofs_put_io_state(struct exofs_io_state *ios); 230void exofs_put_io_state(struct exofs_io_state *ios);
194 231
195int exofs_check_io(struct exofs_io_state *ios, u64 *resid); 232int exofs_check_io(struct exofs_io_state *ios, u64 *resid);
@@ -226,7 +263,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
226 struct page **pagep, void **fsdata); 263 struct page **pagep, void **fsdata);
227extern struct inode *exofs_iget(struct super_block *, unsigned long); 264extern struct inode *exofs_iget(struct super_block *, unsigned long);
228struct inode *exofs_new_inode(struct inode *, int); 265struct inode *exofs_new_inode(struct inode *, int);
229extern int exofs_write_inode(struct inode *, int); 266extern int exofs_write_inode(struct inode *, struct writeback_control *wbc);
230extern void exofs_delete_inode(struct inode *); 267extern void exofs_delete_inode(struct inode *);
231 268
232/* dir.c: */ 269/* dir.c: */
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 698a8636d39c..76d2a79ef93e 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -31,6 +31,7 @@
31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32 */ 32 */
33 33
34#include <linux/slab.h>
34#include <linux/writeback.h> 35#include <linux/writeback.h>
35#include <linux/buffer_head.h> 36#include <linux/buffer_head.h>
36#include <scsi/scsi_device.h> 37#include <scsi/scsi_device.h>
@@ -41,16 +42,18 @@
41 42
42enum { BIO_MAX_PAGES_KMALLOC = 43enum { BIO_MAX_PAGES_KMALLOC =
43 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), 44 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
45 MAX_PAGES_KMALLOC =
46 PAGE_SIZE / sizeof(struct page *),
44}; 47};
45 48
46struct page_collect { 49struct page_collect {
47 struct exofs_sb_info *sbi; 50 struct exofs_sb_info *sbi;
48 struct request_queue *req_q;
49 struct inode *inode; 51 struct inode *inode;
50 unsigned expected_pages; 52 unsigned expected_pages;
51 struct exofs_io_state *ios; 53 struct exofs_io_state *ios;
52 54
53 struct bio *bio; 55 struct page **pages;
56 unsigned alloc_pages;
54 unsigned nr_pages; 57 unsigned nr_pages;
55 unsigned long length; 58 unsigned long length;
56 loff_t pg_first; /* keep 64bit also in 32-arches */ 59 loff_t pg_first; /* keep 64bit also in 32-arches */
@@ -62,15 +65,12 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
62 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; 65 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
63 66
64 pcol->sbi = sbi; 67 pcol->sbi = sbi;
65 /* Create master bios on first Q, later on cloning, each clone will be
66 * allocated on it's destination Q
67 */
68 pcol->req_q = osd_request_queue(sbi->s_ods[0]);
69 pcol->inode = inode; 68 pcol->inode = inode;
70 pcol->expected_pages = expected_pages; 69 pcol->expected_pages = expected_pages;
71 70
72 pcol->ios = NULL; 71 pcol->ios = NULL;
73 pcol->bio = NULL; 72 pcol->pages = NULL;
73 pcol->alloc_pages = 0;
74 pcol->nr_pages = 0; 74 pcol->nr_pages = 0;
75 pcol->length = 0; 75 pcol->length = 0;
76 pcol->pg_first = -1; 76 pcol->pg_first = -1;
@@ -80,7 +80,8 @@ static void _pcol_reset(struct page_collect *pcol)
80{ 80{
81 pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages); 81 pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
82 82
83 pcol->bio = NULL; 83 pcol->pages = NULL;
84 pcol->alloc_pages = 0;
84 pcol->nr_pages = 0; 85 pcol->nr_pages = 0;
85 pcol->length = 0; 86 pcol->length = 0;
86 pcol->pg_first = -1; 87 pcol->pg_first = -1;
@@ -90,38 +91,43 @@ static void _pcol_reset(struct page_collect *pcol)
90 * it might not end here. don't be left with nothing 91 * it might not end here. don't be left with nothing
91 */ 92 */
92 if (!pcol->expected_pages) 93 if (!pcol->expected_pages)
93 pcol->expected_pages = BIO_MAX_PAGES_KMALLOC; 94 pcol->expected_pages = MAX_PAGES_KMALLOC;
94} 95}
95 96
96static int pcol_try_alloc(struct page_collect *pcol) 97static int pcol_try_alloc(struct page_collect *pcol)
97{ 98{
98 int pages = min_t(unsigned, pcol->expected_pages, 99 unsigned pages = min_t(unsigned, pcol->expected_pages,
99 BIO_MAX_PAGES_KMALLOC); 100 MAX_PAGES_KMALLOC);
100 101
101 if (!pcol->ios) { /* First time allocate io_state */ 102 if (!pcol->ios) { /* First time allocate io_state */
102 int ret = exofs_get_io_state(pcol->sbi, &pcol->ios); 103 int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios);
103 104
104 if (ret) 105 if (ret)
105 return ret; 106 return ret;
106 } 107 }
107 108
109 /* TODO: easily support bio chaining */
110 pages = min_t(unsigned, pages,
111 pcol->sbi->layout.group_width * BIO_MAX_PAGES_KMALLOC);
112
108 for (; pages; pages >>= 1) { 113 for (; pages; pages >>= 1) {
109 pcol->bio = bio_kmalloc(GFP_KERNEL, pages); 114 pcol->pages = kmalloc(pages * sizeof(struct page *),
110 if (likely(pcol->bio)) 115 GFP_KERNEL);
116 if (likely(pcol->pages)) {
117 pcol->alloc_pages = pages;
111 return 0; 118 return 0;
119 }
112 } 120 }
113 121
114 EXOFS_ERR("Failed to bio_kmalloc expected_pages=%u\n", 122 EXOFS_ERR("Failed to kmalloc expected_pages=%u\n",
115 pcol->expected_pages); 123 pcol->expected_pages);
116 return -ENOMEM; 124 return -ENOMEM;
117} 125}
118 126
119static void pcol_free(struct page_collect *pcol) 127static void pcol_free(struct page_collect *pcol)
120{ 128{
121 if (pcol->bio) { 129 kfree(pcol->pages);
122 bio_put(pcol->bio); 130 pcol->pages = NULL;
123 pcol->bio = NULL;
124 }
125 131
126 if (pcol->ios) { 132 if (pcol->ios) {
127 exofs_put_io_state(pcol->ios); 133 exofs_put_io_state(pcol->ios);
@@ -132,11 +138,10 @@ static void pcol_free(struct page_collect *pcol)
132static int pcol_add_page(struct page_collect *pcol, struct page *page, 138static int pcol_add_page(struct page_collect *pcol, struct page *page,
133 unsigned len) 139 unsigned len)
134{ 140{
135 int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0); 141 if (unlikely(pcol->nr_pages >= pcol->alloc_pages))
136 if (unlikely(len != added_len))
137 return -ENOMEM; 142 return -ENOMEM;
138 143
139 ++pcol->nr_pages; 144 pcol->pages[pcol->nr_pages++] = page;
140 pcol->length += len; 145 pcol->length += len;
141 return 0; 146 return 0;
142} 147}
@@ -181,7 +186,6 @@ static void update_write_page(struct page *page, int ret)
181 */ 186 */
182static int __readpages_done(struct page_collect *pcol, bool do_unlock) 187static int __readpages_done(struct page_collect *pcol, bool do_unlock)
183{ 188{
184 struct bio_vec *bvec;
185 int i; 189 int i;
186 u64 resid; 190 u64 resid;
187 u64 good_bytes; 191 u64 good_bytes;
@@ -193,13 +197,13 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
193 else 197 else
194 good_bytes = pcol->length - resid; 198 good_bytes = pcol->length - resid;
195 199
196 EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx" 200 EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
197 " length=0x%lx nr_pages=%u\n", 201 " length=0x%lx nr_pages=%u\n",
198 pcol->inode->i_ino, _LLU(good_bytes), pcol->length, 202 pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
199 pcol->nr_pages); 203 pcol->nr_pages);
200 204
201 __bio_for_each_segment(bvec, pcol->bio, i, 0) { 205 for (i = 0; i < pcol->nr_pages; i++) {
202 struct page *page = bvec->bv_page; 206 struct page *page = pcol->pages[i];
203 struct inode *inode = page->mapping->host; 207 struct inode *inode = page->mapping->host;
204 int page_stat; 208 int page_stat;
205 209
@@ -218,11 +222,11 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
218 ret = update_read_page(page, page_stat); 222 ret = update_read_page(page, page_stat);
219 if (do_unlock) 223 if (do_unlock)
220 unlock_page(page); 224 unlock_page(page);
221 length += bvec->bv_len; 225 length += PAGE_SIZE;
222 } 226 }
223 227
224 pcol_free(pcol); 228 pcol_free(pcol);
225 EXOFS_DBGMSG("readpages_done END\n"); 229 EXOFS_DBGMSG2("readpages_done END\n");
226 return ret; 230 return ret;
227} 231}
228 232
@@ -238,11 +242,10 @@ static void readpages_done(struct exofs_io_state *ios, void *p)
238 242
239static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) 243static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
240{ 244{
241 struct bio_vec *bvec;
242 int i; 245 int i;
243 246
244 __bio_for_each_segment(bvec, pcol->bio, i, 0) { 247 for (i = 0; i < pcol->nr_pages; i++) {
245 struct page *page = bvec->bv_page; 248 struct page *page = pcol->pages[i];
246 249
247 if (rw == READ) 250 if (rw == READ)
248 update_read_page(page, ret); 251 update_read_page(page, ret);
@@ -260,13 +263,14 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
260 struct page_collect *pcol_copy = NULL; 263 struct page_collect *pcol_copy = NULL;
261 int ret; 264 int ret;
262 265
263 if (!pcol->bio) 266 if (!pcol->pages)
264 return 0; 267 return 0;
265 268
266 /* see comment in _readpage() about sync reads */ 269 /* see comment in _readpage() about sync reads */
267 WARN_ON(is_sync && (pcol->nr_pages != 1)); 270 WARN_ON(is_sync && (pcol->nr_pages != 1));
268 271
269 ios->bio = pcol->bio; 272 ios->pages = pcol->pages;
273 ios->nr_pages = pcol->nr_pages;
270 ios->length = pcol->length; 274 ios->length = pcol->length;
271 ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT; 275 ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
272 276
@@ -290,7 +294,7 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
290 294
291 atomic_inc(&pcol->sbi->s_curr_pending); 295 atomic_inc(&pcol->sbi->s_curr_pending);
292 296
293 EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", 297 EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
294 ios->obj.id, _LLU(ios->offset), pcol->length); 298 ios->obj.id, _LLU(ios->offset), pcol->length);
295 299
296 /* pages ownership was passed to pcol_copy */ 300 /* pages ownership was passed to pcol_copy */
@@ -366,7 +370,7 @@ try_again:
366 goto try_again; 370 goto try_again;
367 } 371 }
368 372
369 if (!pcol->bio) { 373 if (!pcol->pages) {
370 ret = pcol_try_alloc(pcol); 374 ret = pcol_try_alloc(pcol);
371 if (unlikely(ret)) 375 if (unlikely(ret))
372 goto fail; 376 goto fail;
@@ -448,7 +452,6 @@ static int exofs_readpage(struct file *file, struct page *page)
448static void writepages_done(struct exofs_io_state *ios, void *p) 452static void writepages_done(struct exofs_io_state *ios, void *p)
449{ 453{
450 struct page_collect *pcol = p; 454 struct page_collect *pcol = p;
451 struct bio_vec *bvec;
452 int i; 455 int i;
453 u64 resid; 456 u64 resid;
454 u64 good_bytes; 457 u64 good_bytes;
@@ -462,13 +465,13 @@ static void writepages_done(struct exofs_io_state *ios, void *p)
462 else 465 else
463 good_bytes = pcol->length - resid; 466 good_bytes = pcol->length - resid;
464 467
465 EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx" 468 EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
466 " length=0x%lx nr_pages=%u\n", 469 " length=0x%lx nr_pages=%u\n",
467 pcol->inode->i_ino, _LLU(good_bytes), pcol->length, 470 pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
468 pcol->nr_pages); 471 pcol->nr_pages);
469 472
470 __bio_for_each_segment(bvec, pcol->bio, i, 0) { 473 for (i = 0; i < pcol->nr_pages; i++) {
471 struct page *page = bvec->bv_page; 474 struct page *page = pcol->pages[i];
472 struct inode *inode = page->mapping->host; 475 struct inode *inode = page->mapping->host;
473 int page_stat; 476 int page_stat;
474 477
@@ -485,12 +488,12 @@ static void writepages_done(struct exofs_io_state *ios, void *p)
485 EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n", 488 EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n",
486 inode->i_ino, page->index, page_stat); 489 inode->i_ino, page->index, page_stat);
487 490
488 length += bvec->bv_len; 491 length += PAGE_SIZE;
489 } 492 }
490 493
491 pcol_free(pcol); 494 pcol_free(pcol);
492 kfree(pcol); 495 kfree(pcol);
493 EXOFS_DBGMSG("writepages_done END\n"); 496 EXOFS_DBGMSG2("writepages_done END\n");
494} 497}
495 498
496static int write_exec(struct page_collect *pcol) 499static int write_exec(struct page_collect *pcol)
@@ -500,7 +503,7 @@ static int write_exec(struct page_collect *pcol)
500 struct page_collect *pcol_copy = NULL; 503 struct page_collect *pcol_copy = NULL;
501 int ret; 504 int ret;
502 505
503 if (!pcol->bio) 506 if (!pcol->pages)
504 return 0; 507 return 0;
505 508
506 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 509 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
@@ -512,9 +515,8 @@ static int write_exec(struct page_collect *pcol)
512 515
513 *pcol_copy = *pcol; 516 *pcol_copy = *pcol;
514 517
515 pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */ 518 ios->pages = pcol_copy->pages;
516 519 ios->nr_pages = pcol_copy->nr_pages;
517 ios->bio = pcol_copy->bio;
518 ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT; 520 ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT;
519 ios->length = pcol_copy->length; 521 ios->length = pcol_copy->length;
520 ios->done = writepages_done; 522 ios->done = writepages_done;
@@ -527,7 +529,7 @@ static int write_exec(struct page_collect *pcol)
527 } 529 }
528 530
529 atomic_inc(&pcol->sbi->s_curr_pending); 531 atomic_inc(&pcol->sbi->s_curr_pending);
530 EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", 532 EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
531 pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset), 533 pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
532 pcol->length); 534 pcol->length);
533 /* pages ownership was passed to pcol_copy */ 535 /* pages ownership was passed to pcol_copy */
@@ -605,7 +607,7 @@ try_again:
605 goto try_again; 607 goto try_again;
606 } 608 }
607 609
608 if (!pcol->bio) { 610 if (!pcol->pages) {
609 ret = pcol_try_alloc(pcol); 611 ret = pcol_try_alloc(pcol);
610 if (unlikely(ret)) 612 if (unlikely(ret))
611 goto fail; 613 goto fail;
@@ -616,7 +618,7 @@ try_again:
616 618
617 ret = pcol_add_page(pcol, page, len); 619 ret = pcol_add_page(pcol, page, len);
618 if (unlikely(ret)) { 620 if (unlikely(ret)) {
619 EXOFS_DBGMSG("Failed pcol_add_page " 621 EXOFS_DBGMSG2("Failed pcol_add_page "
620 "nr_pages=%u total_length=0x%lx\n", 622 "nr_pages=%u total_length=0x%lx\n",
621 pcol->nr_pages, pcol->length); 623 pcol->nr_pages, pcol->length);
622 624
@@ -663,7 +665,7 @@ static int exofs_writepages(struct address_space *mapping,
663 if (expected_pages < 32L) 665 if (expected_pages < 32L)
664 expected_pages = 32L; 666 expected_pages = 32L;
665 667
666 EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx " 668 EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
667 "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n", 669 "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n",
668 mapping->host->i_ino, wbc->range_start, wbc->range_end, 670 mapping->host->i_ino, wbc->range_start, wbc->range_end,
669 mapping->nrpages, start, end, expected_pages); 671 mapping->nrpages, start, end, expected_pages);
@@ -738,13 +740,28 @@ static int exofs_write_begin_export(struct file *file,
738 fsdata); 740 fsdata);
739} 741}
740 742
743static int exofs_write_end(struct file *file, struct address_space *mapping,
744 loff_t pos, unsigned len, unsigned copied,
745 struct page *page, void *fsdata)
746{
747 struct inode *inode = mapping->host;
748 /* According to comment in simple_write_end i_mutex is held */
749 loff_t i_size = inode->i_size;
750 int ret;
751
752 ret = simple_write_end(file, mapping,pos, len, copied, page, fsdata);
753 if (i_size != inode->i_size)
754 mark_inode_dirty(inode);
755 return ret;
756}
757
741const struct address_space_operations exofs_aops = { 758const struct address_space_operations exofs_aops = {
742 .readpage = exofs_readpage, 759 .readpage = exofs_readpage,
743 .readpages = exofs_readpages, 760 .readpages = exofs_readpages,
744 .writepage = exofs_writepage, 761 .writepage = exofs_writepage,
745 .writepages = exofs_writepages, 762 .writepages = exofs_writepages,
746 .write_begin = exofs_write_begin_export, 763 .write_begin = exofs_write_begin_export,
747 .write_end = simple_write_end, 764 .write_end = exofs_write_end,
748}; 765};
749 766
750/****************************************************************************** 767/******************************************************************************
@@ -844,20 +861,33 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
844 return error; 861 return error;
845} 862}
846 863
864static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF(
865 EXOFS_APAGE_FS_DATA,
866 EXOFS_ATTR_INODE_FILE_LAYOUT,
867 0);
868static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF(
869 EXOFS_APAGE_FS_DATA,
870 EXOFS_ATTR_INODE_DIR_LAYOUT,
871 0);
872
847/* 873/*
848 * Read an inode from the OSD, and return it as is. We also return the size 874 * Read the Linux inode info from the OSD, and return it as is. In exofs the
849 * attribute in the 'obj_size' argument. 875 * inode info is in an application specific page/attribute of the osd-object.
850 */ 876 */
851static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, 877static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
852 struct exofs_fcb *inode, uint64_t *obj_size) 878 struct exofs_fcb *inode)
853{ 879{
854 struct exofs_sb_info *sbi = sb->s_fs_info; 880 struct exofs_sb_info *sbi = sb->s_fs_info;
855 struct osd_attr attrs[2]; 881 struct osd_attr attrs[] = {
882 [0] = g_attr_inode_data,
883 [1] = g_attr_inode_file_layout,
884 [2] = g_attr_inode_dir_layout,
885 };
856 struct exofs_io_state *ios; 886 struct exofs_io_state *ios;
887 struct exofs_on_disk_inode_layout *layout;
857 int ret; 888 int ret;
858 889
859 *obj_size = ~0; 890 ret = exofs_get_io_state(&sbi->layout, &ios);
860 ret = exofs_get_io_state(sbi, &ios);
861 if (unlikely(ret)) { 891 if (unlikely(ret)) {
862 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); 892 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
863 return ret; 893 return ret;
@@ -867,14 +897,25 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
867 exofs_make_credential(oi->i_cred, &ios->obj); 897 exofs_make_credential(oi->i_cred, &ios->obj);
868 ios->cred = oi->i_cred; 898 ios->cred = oi->i_cred;
869 899
870 attrs[0] = g_attr_inode_data; 900 attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
871 attrs[1] = g_attr_logical_length; 901 attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
902
872 ios->in_attr = attrs; 903 ios->in_attr = attrs;
873 ios->in_attr_len = ARRAY_SIZE(attrs); 904 ios->in_attr_len = ARRAY_SIZE(attrs);
874 905
875 ret = exofs_sbi_read(ios); 906 ret = exofs_sbi_read(ios);
876 if (ret) 907 if (unlikely(ret)) {
908 EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n",
909 _LLU(ios->obj.id), ret);
910 memset(inode, 0, sizeof(*inode));
911 inode->i_mode = 0040000 | (0777 & ~022);
912 /* If object is lost on target we might as well enable it's
913 * delete.
914 */
915 if ((ret == -ENOENT) || (ret == -EINVAL))
916 ret = 0;
877 goto out; 917 goto out;
918 }
878 919
879 ret = extract_attr_from_ios(ios, &attrs[0]); 920 ret = extract_attr_from_ios(ios, &attrs[0]);
880 if (ret) { 921 if (ret) {
@@ -886,11 +927,33 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
886 927
887 ret = extract_attr_from_ios(ios, &attrs[1]); 928 ret = extract_attr_from_ios(ios, &attrs[1]);
888 if (ret) { 929 if (ret) {
889 EXOFS_ERR("%s: extract_attr of logical_length failed\n", 930 EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
890 __func__);
891 goto out; 931 goto out;
892 } 932 }
893 *obj_size = get_unaligned_be64(attrs[1].val_ptr); 933 if (attrs[1].len) {
934 layout = attrs[1].val_ptr;
935 if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
936 EXOFS_ERR("%s: unsupported files layout %d\n",
937 __func__, layout->gen_func);
938 ret = -ENOTSUPP;
939 goto out;
940 }
941 }
942
943 ret = extract_attr_from_ios(ios, &attrs[2]);
944 if (ret) {
945 EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
946 goto out;
947 }
948 if (attrs[2].len) {
949 layout = attrs[2].val_ptr;
950 if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
951 EXOFS_ERR("%s: unsupported meta-data layout %d\n",
952 __func__, layout->gen_func);
953 ret = -ENOTSUPP;
954 goto out;
955 }
956 }
894 957
895out: 958out:
896 exofs_put_io_state(ios); 959 exofs_put_io_state(ios);
@@ -910,7 +973,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
910 struct exofs_i_info *oi; 973 struct exofs_i_info *oi;
911 struct exofs_fcb fcb; 974 struct exofs_fcb fcb;
912 struct inode *inode; 975 struct inode *inode;
913 uint64_t obj_size;
914 int ret; 976 int ret;
915 977
916 inode = iget_locked(sb, ino); 978 inode = iget_locked(sb, ino);
@@ -922,7 +984,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
922 __oi_init(oi); 984 __oi_init(oi);
923 985
924 /* read the inode from the osd */ 986 /* read the inode from the osd */
925 ret = exofs_get_inode(sb, oi, &fcb, &obj_size); 987 ret = exofs_get_inode(sb, oi, &fcb);
926 if (ret) 988 if (ret)
927 goto bad_inode; 989 goto bad_inode;
928 990
@@ -943,13 +1005,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
943 inode->i_blkbits = EXOFS_BLKSHIFT; 1005 inode->i_blkbits = EXOFS_BLKSHIFT;
944 inode->i_generation = le32_to_cpu(fcb.i_generation); 1006 inode->i_generation = le32_to_cpu(fcb.i_generation);
945 1007
946 if ((inode->i_size != obj_size) &&
947 (!exofs_inode_is_fast_symlink(inode))) {
948 EXOFS_ERR("WARNING: Size of inode=%llu != object=%llu\n",
949 inode->i_size, _LLU(obj_size));
950 /* FIXME: call exofs_inode_recovery() */
951 }
952
953 oi->i_dir_start_lookup = 0; 1008 oi->i_dir_start_lookup = 0;
954 1009
955 if ((inode->i_nlink == 0) && (inode->i_mode == 0)) { 1010 if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
@@ -1028,7 +1083,7 @@ static void create_done(struct exofs_io_state *ios, void *p)
1028 1083
1029 if (unlikely(ret)) { 1084 if (unlikely(ret)) {
1030 EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx", 1085 EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx",
1031 _LLU(exofs_oi_objno(oi)), _LLU(sbi->s_pid)); 1086 _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid));
1032 /*TODO: When FS is corrupted creation can fail, object already 1087 /*TODO: When FS is corrupted creation can fail, object already
1033 * exist. Get rid of this asynchronous creation, if exist 1088 * exist. Get rid of this asynchronous creation, if exist
1034 * increment the obj counter and try the next object. Until we 1089 * increment the obj counter and try the next object. Until we
@@ -1089,7 +1144,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1089 1144
1090 mark_inode_dirty(inode); 1145 mark_inode_dirty(inode);
1091 1146
1092 ret = exofs_get_io_state(sbi, &ios); 1147 ret = exofs_get_io_state(&sbi->layout, &ios);
1093 if (unlikely(ret)) { 1148 if (unlikely(ret)) {
1094 EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n"); 1149 EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n");
1095 return ERR_PTR(ret); 1150 return ERR_PTR(ret);
@@ -1155,8 +1210,10 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1155 int ret; 1210 int ret;
1156 1211
1157 args = kzalloc(sizeof(*args), GFP_KERNEL); 1212 args = kzalloc(sizeof(*args), GFP_KERNEL);
1158 if (!args) 1213 if (!args) {
1214 EXOFS_DBGMSG("Faild kzalloc of args\n");
1159 return -ENOMEM; 1215 return -ENOMEM;
1216 }
1160 1217
1161 fcb = &args->fcb; 1218 fcb = &args->fcb;
1162 1219
@@ -1185,7 +1242,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1185 } else 1242 } else
1186 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); 1243 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
1187 1244
1188 ret = exofs_get_io_state(sbi, &ios); 1245 ret = exofs_get_io_state(&sbi->layout, &ios);
1189 if (unlikely(ret)) { 1246 if (unlikely(ret)) {
1190 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); 1247 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
1191 goto free_args; 1248 goto free_args;
@@ -1219,13 +1276,14 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1219free_args: 1276free_args:
1220 kfree(args); 1277 kfree(args);
1221out: 1278out:
1222 EXOFS_DBGMSG("ret=>%d\n", ret); 1279 EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n",
1280 inode->i_ino, do_sync, ret);
1223 return ret; 1281 return ret;
1224} 1282}
1225 1283
1226int exofs_write_inode(struct inode *inode, int wait) 1284int exofs_write_inode(struct inode *inode, struct writeback_control *wbc)
1227{ 1285{
1228 return exofs_update_inode(inode, wait); 1286 return exofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1229} 1287}
1230 1288
1231/* 1289/*
@@ -1268,7 +1326,7 @@ void exofs_delete_inode(struct inode *inode)
1268 1326
1269 clear_inode(inode); 1327 clear_inode(inode);
1270 1328
1271 ret = exofs_get_io_state(sbi, &ios); 1329 ret = exofs_get_io_state(&sbi->layout, &ios);
1272 if (unlikely(ret)) { 1330 if (unlikely(ret)) {
1273 EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__); 1331 EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__);
1274 return; 1332 return;
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 5bad01fa1f9f..4337cad7777b 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -22,10 +22,15 @@
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 */ 23 */
24 24
25#include <linux/slab.h>
25#include <scsi/scsi_device.h> 26#include <scsi/scsi_device.h>
27#include <asm/div64.h>
26 28
27#include "exofs.h" 29#include "exofs.h"
28 30
31#define EXOFS_DBGMSG2(M...) do {} while (0)
32/* #define EXOFS_DBGMSG2 EXOFS_DBGMSG */
33
29void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) 34void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
30{ 35{
31 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); 36 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
@@ -64,21 +69,24 @@ out:
64 return ret; 69 return ret;
65} 70}
66 71
67int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios) 72int exofs_get_io_state(struct exofs_layout *layout,
73 struct exofs_io_state **pios)
68{ 74{
69 struct exofs_io_state *ios; 75 struct exofs_io_state *ios;
70 76
71 /*TODO: Maybe use kmem_cach per sbi of size 77 /*TODO: Maybe use kmem_cach per sbi of size
72 * exofs_io_state_size(sbi->s_numdevs) 78 * exofs_io_state_size(layout->s_numdevs)
73 */ 79 */
74 ios = kzalloc(exofs_io_state_size(sbi->s_numdevs), GFP_KERNEL); 80 ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL);
75 if (unlikely(!ios)) { 81 if (unlikely(!ios)) {
82 EXOFS_DBGMSG("Faild kzalloc bytes=%d\n",
83 exofs_io_state_size(layout->s_numdevs));
76 *pios = NULL; 84 *pios = NULL;
77 return -ENOMEM; 85 return -ENOMEM;
78 } 86 }
79 87
80 ios->sbi = sbi; 88 ios->layout = layout;
81 ios->obj.partition = sbi->s_pid; 89 ios->obj.partition = layout->s_pid;
82 *pios = ios; 90 *pios = ios;
83 return 0; 91 return 0;
84} 92}
@@ -101,6 +109,29 @@ void exofs_put_io_state(struct exofs_io_state *ios)
101 } 109 }
102} 110}
103 111
112unsigned exofs_layout_od_id(struct exofs_layout *layout,
113 osd_id obj_no, unsigned layout_index)
114{
115/* switch (layout->lay_func) {
116 case LAYOUT_MOVING_WINDOW:
117 {*/
118 unsigned dev_mod = obj_no;
119
120 return (layout_index + dev_mod * layout->mirrors_p1) %
121 layout->s_numdevs;
122/* }
123 case LAYOUT_FUNC_IMPLICT:
124 return layout->devs[layout_index];
125 }*/
126}
127
128static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios,
129 unsigned layout_index)
130{
131 return ios->layout->s_ods[
132 exofs_layout_od_id(ios->layout, ios->obj.id, layout_index)];
133}
134
104static void _sync_done(struct exofs_io_state *ios, void *p) 135static void _sync_done(struct exofs_io_state *ios, void *p)
105{ 136{
106 struct completion *waiting = p; 137 struct completion *waiting = p;
@@ -168,6 +199,21 @@ static int exofs_io_execute(struct exofs_io_state *ios)
168 return ret; 199 return ret;
169} 200}
170 201
202static void _clear_bio(struct bio *bio)
203{
204 struct bio_vec *bv;
205 unsigned i;
206
207 __bio_for_each_segment(bv, bio, i, 0) {
208 unsigned this_count = bv->bv_len;
209
210 if (likely(PAGE_SIZE == this_count))
211 clear_highpage(bv->bv_page);
212 else
213 zero_user(bv->bv_page, bv->bv_offset, this_count);
214 }
215}
216
171int exofs_check_io(struct exofs_io_state *ios, u64 *resid) 217int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
172{ 218{
173 enum osd_err_priority acumulated_osd_err = 0; 219 enum osd_err_priority acumulated_osd_err = 0;
@@ -176,16 +222,25 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
176 222
177 for (i = 0; i < ios->numdevs; i++) { 223 for (i = 0; i < ios->numdevs; i++) {
178 struct osd_sense_info osi; 224 struct osd_sense_info osi;
179 int ret = osd_req_decode_sense(ios->per_dev[i].or, &osi); 225 struct osd_request *or = ios->per_dev[i].or;
226 int ret;
227
228 if (unlikely(!or))
229 continue;
180 230
231 ret = osd_req_decode_sense(or, &osi);
181 if (likely(!ret)) 232 if (likely(!ret))
182 continue; 233 continue;
183 234
184 if (unlikely(ret == -EFAULT)) { 235 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
185 EXOFS_DBGMSG("%s: EFAULT Need page clear\n", __func__); 236 /* start read offset passed endof file */
186 /*FIXME: All the pages in this device range should: 237 _clear_bio(ios->per_dev[i].bio);
187 * clear_highpage(page); 238 EXOFS_DBGMSG("start read offset passed end of file "
188 */ 239 "offset=0x%llx, length=0x%llx\n",
240 _LLU(ios->per_dev[i].offset),
241 _LLU(ios->per_dev[i].length));
242
243 continue; /* we recovered */
189 } 244 }
190 245
191 if (osi.osd_err_pri >= acumulated_osd_err) { 246 if (osi.osd_err_pri >= acumulated_osd_err) {
@@ -205,14 +260,259 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
205 return acumulated_lin_err; 260 return acumulated_lin_err;
206} 261}
207 262
263/*
264 * L - logical offset into the file
265 *
266 * U - The number of bytes in a stripe within a group
267 *
268 * U = stripe_unit * group_width
269 *
270 * T - The number of bytes striped within a group of component objects
271 * (before advancing to the next group)
272 *
273 * T = stripe_unit * group_width * group_depth
274 *
275 * S - The number of bytes striped across all component objects
276 * before the pattern repeats
277 *
278 * S = stripe_unit * group_width * group_depth * group_count
279 *
280 * M - The "major" (i.e., across all components) stripe number
281 *
282 * M = L / S
283 *
284 * G - Counts the groups from the beginning of the major stripe
285 *
286 * G = (L - (M * S)) / T [or (L % S) / T]
287 *
288 * H - The byte offset within the group
289 *
290 * H = (L - (M * S)) % T [or (L % S) % T]
291 *
292 * N - The "minor" (i.e., across the group) stripe number
293 *
294 * N = H / U
295 *
296 * C - The component index coresponding to L
297 *
298 * C = (H - (N * U)) / stripe_unit + G * group_width
299 * [or (L % U) / stripe_unit + G * group_width]
300 *
301 * O - The component offset coresponding to L
302 *
303 * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
304 */
305struct _striping_info {
306 u64 obj_offset;
307 u64 group_length;
308 u64 total_group_length;
309 u64 Major;
310 unsigned dev;
311 unsigned unit_off;
312};
313
314static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset,
315 struct _striping_info *si)
316{
317 u32 stripe_unit = ios->layout->stripe_unit;
318 u32 group_width = ios->layout->group_width;
319 u64 group_depth = ios->layout->group_depth;
320
321 u32 U = stripe_unit * group_width;
322 u64 T = U * group_depth;
323 u64 S = T * ios->layout->group_count;
324 u64 M = div64_u64(file_offset, S);
325
326 /*
327 G = (L - (M * S)) / T
328 H = (L - (M * S)) % T
329 */
330 u64 LmodS = file_offset - M * S;
331 u32 G = div64_u64(LmodS, T);
332 u64 H = LmodS - G * T;
333
334 u32 N = div_u64(H, U);
335
336 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
337 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
338 si->dev *= ios->layout->mirrors_p1;
339
340 div_u64_rem(file_offset, stripe_unit, &si->unit_off);
341
342 si->obj_offset = si->unit_off + (N * stripe_unit) +
343 (M * group_depth * stripe_unit);
344
345 si->group_length = T - H;
346 si->total_group_length = T;
347 si->Major = M;
348}
349
350static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
351 unsigned pgbase, struct exofs_per_dev_state *per_dev,
352 int cur_len)
353{
354 unsigned pg = *cur_pg;
355 struct request_queue *q =
356 osd_request_queue(exofs_ios_od(ios, per_dev->dev));
357
358 per_dev->length += cur_len;
359
360 if (per_dev->bio == NULL) {
361 unsigned pages_in_stripe = ios->layout->group_width *
362 (ios->layout->stripe_unit / PAGE_SIZE);
363 unsigned bio_size = (ios->nr_pages + pages_in_stripe) /
364 ios->layout->group_width;
365
366 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
367 if (unlikely(!per_dev->bio)) {
368 EXOFS_DBGMSG("Faild to allocate BIO size=%u\n",
369 bio_size);
370 return -ENOMEM;
371 }
372 }
373
374 while (cur_len > 0) {
375 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
376 unsigned added_len;
377
378 BUG_ON(ios->nr_pages <= pg);
379 cur_len -= pglen;
380
381 added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg],
382 pglen, pgbase);
383 if (unlikely(pglen != added_len))
384 return -ENOMEM;
385 pgbase = 0;
386 ++pg;
387 }
388 BUG_ON(cur_len);
389
390 *cur_pg = pg;
391 return 0;
392}
393
394static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
395 struct _striping_info *si, unsigned first_comp)
396{
397 unsigned stripe_unit = ios->layout->stripe_unit;
398 unsigned mirrors_p1 = ios->layout->mirrors_p1;
399 unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
400 unsigned dev = si->dev;
401 unsigned first_dev = dev - (dev % devs_in_group);
402 unsigned comp = first_comp + (dev - first_dev);
403 unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
404 unsigned cur_pg = ios->pages_consumed;
405 int ret = 0;
406
407 while (length) {
408 struct exofs_per_dev_state *per_dev = &ios->per_dev[comp];
409 unsigned cur_len, page_off = 0;
410
411 if (!per_dev->length) {
412 per_dev->dev = dev;
413 if (dev < si->dev) {
414 per_dev->offset = si->obj_offset + stripe_unit -
415 si->unit_off;
416 cur_len = stripe_unit;
417 } else if (dev == si->dev) {
418 per_dev->offset = si->obj_offset;
419 cur_len = stripe_unit - si->unit_off;
420 page_off = si->unit_off & ~PAGE_MASK;
421 BUG_ON(page_off && (page_off != ios->pgbase));
422 } else { /* dev > si->dev */
423 per_dev->offset = si->obj_offset - si->unit_off;
424 cur_len = stripe_unit;
425 }
426
427 if (max_comp < comp)
428 max_comp = comp;
429
430 dev += mirrors_p1;
431 dev = (dev % devs_in_group) + first_dev;
432 } else {
433 cur_len = stripe_unit;
434 }
435 if (cur_len >= length)
436 cur_len = length;
437
438 ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
439 cur_len);
440 if (unlikely(ret))
441 goto out;
442
443 comp += mirrors_p1;
444 comp = (comp % devs_in_group) + first_comp;
445
446 length -= cur_len;
447 }
448out:
449 ios->numdevs = max_comp + mirrors_p1;
450 ios->pages_consumed = cur_pg;
451 return ret;
452}
453
454static int _prepare_for_striping(struct exofs_io_state *ios)
455{
456 u64 length = ios->length;
457 struct _striping_info si;
458 unsigned devs_in_group = ios->layout->group_width *
459 ios->layout->mirrors_p1;
460 unsigned first_comp = 0;
461 int ret = 0;
462
463 _calc_stripe_info(ios, ios->offset, &si);
464
465 if (!ios->pages) {
466 if (ios->kern_buff) {
467 struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
468
469 per_dev->offset = si.obj_offset;
470 per_dev->dev = si.dev;
471
472 /* no cross device without page array */
473 BUG_ON((ios->layout->group_width > 1) &&
474 (si.unit_off + ios->length >
475 ios->layout->stripe_unit));
476 }
477 ios->numdevs = ios->layout->mirrors_p1;
478 return 0;
479 }
480
481 while (length) {
482 if (length < si.group_length)
483 si.group_length = length;
484
485 ret = _prepare_one_group(ios, si.group_length, &si, first_comp);
486 if (unlikely(ret))
487 goto out;
488
489 length -= si.group_length;
490
491 si.group_length = si.total_group_length;
492 si.unit_off = 0;
493 ++si.Major;
494 si.obj_offset = si.Major * ios->layout->stripe_unit *
495 ios->layout->group_depth;
496
497 si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group;
498 si.dev %= ios->layout->s_numdevs;
499
500 first_comp += devs_in_group;
501 first_comp %= ios->layout->s_numdevs;
502 }
503
504out:
505 return ret;
506}
507
208int exofs_sbi_create(struct exofs_io_state *ios) 508int exofs_sbi_create(struct exofs_io_state *ios)
209{ 509{
210 int i, ret; 510 int i, ret;
211 511
212 for (i = 0; i < ios->sbi->s_numdevs; i++) { 512 for (i = 0; i < ios->layout->s_numdevs; i++) {
213 struct osd_request *or; 513 struct osd_request *or;
214 514
215 or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL); 515 or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
216 if (unlikely(!or)) { 516 if (unlikely(!or)) {
217 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 517 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
218 ret = -ENOMEM; 518 ret = -ENOMEM;
@@ -233,10 +533,10 @@ int exofs_sbi_remove(struct exofs_io_state *ios)
233{ 533{
234 int i, ret; 534 int i, ret;
235 535
236 for (i = 0; i < ios->sbi->s_numdevs; i++) { 536 for (i = 0; i < ios->layout->s_numdevs; i++) {
237 struct osd_request *or; 537 struct osd_request *or;
238 538
239 or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL); 539 or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
240 if (unlikely(!or)) { 540 if (unlikely(!or)) {
241 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 541 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
242 ret = -ENOMEM; 542 ret = -ENOMEM;
@@ -253,51 +553,74 @@ out:
253 return ret; 553 return ret;
254} 554}
255 555
256int exofs_sbi_write(struct exofs_io_state *ios) 556static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
257{ 557{
258 int i, ret; 558 struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp];
559 unsigned dev = ios->per_dev[cur_comp].dev;
560 unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
561 int ret = 0;
259 562
260 for (i = 0; i < ios->sbi->s_numdevs; i++) { 563 if (ios->pages && !master_dev->length)
564 return 0; /* Just an empty slot */
565
566 for (; cur_comp < last_comp; ++cur_comp, ++dev) {
567 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
261 struct osd_request *or; 568 struct osd_request *or;
262 569
263 or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL); 570 or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL);
264 if (unlikely(!or)) { 571 if (unlikely(!or)) {
265 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 572 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
266 ret = -ENOMEM; 573 ret = -ENOMEM;
267 goto out; 574 goto out;
268 } 575 }
269 ios->per_dev[i].or = or; 576 per_dev->or = or;
270 ios->numdevs++; 577 per_dev->offset = master_dev->offset;
271 578
272 if (ios->bio) { 579 if (ios->pages) {
273 struct bio *bio; 580 struct bio *bio;
274 581
275 if (i != 0) { 582 if (per_dev != master_dev) {
276 bio = bio_kmalloc(GFP_KERNEL, 583 bio = bio_kmalloc(GFP_KERNEL,
277 ios->bio->bi_max_vecs); 584 master_dev->bio->bi_max_vecs);
278 if (unlikely(!bio)) { 585 if (unlikely(!bio)) {
586 EXOFS_DBGMSG(
587 "Faild to allocate BIO size=%u\n",
588 master_dev->bio->bi_max_vecs);
279 ret = -ENOMEM; 589 ret = -ENOMEM;
280 goto out; 590 goto out;
281 } 591 }
282 592
283 __bio_clone(bio, ios->bio); 593 __bio_clone(bio, master_dev->bio);
284 bio->bi_bdev = NULL; 594 bio->bi_bdev = NULL;
285 bio->bi_next = NULL; 595 bio->bi_next = NULL;
286 ios->per_dev[i].bio = bio; 596 per_dev->length = master_dev->length;
597 per_dev->bio = bio;
598 per_dev->dev = dev;
287 } else { 599 } else {
288 bio = ios->bio; 600 bio = master_dev->bio;
601 /* FIXME: bio_set_dir() */
602 bio->bi_rw |= (1 << BIO_RW);
289 } 603 }
290 604
291 osd_req_write(or, &ios->obj, ios->offset, bio, 605 osd_req_write(or, &ios->obj, per_dev->offset, bio,
292 ios->length); 606 per_dev->length);
293/* EXOFS_DBGMSG("write sync=%d\n", sync);*/ 607 EXOFS_DBGMSG("write(0x%llx) offset=0x%llx "
608 "length=0x%llx dev=%d\n",
609 _LLU(ios->obj.id), _LLU(per_dev->offset),
610 _LLU(per_dev->length), dev);
294 } else if (ios->kern_buff) { 611 } else if (ios->kern_buff) {
295 osd_req_write_kern(or, &ios->obj, ios->offset, 612 ret = osd_req_write_kern(or, &ios->obj, per_dev->offset,
296 ios->kern_buff, ios->length); 613 ios->kern_buff, ios->length);
297/* EXOFS_DBGMSG("write_kern sync=%d\n", sync);*/ 614 if (unlikely(ret))
615 goto out;
616 EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
617 "length=0x%llx dev=%d\n",
618 _LLU(ios->obj.id), _LLU(per_dev->offset),
619 _LLU(ios->length), dev);
298 } else { 620 } else {
299 osd_req_set_attributes(or, &ios->obj); 621 osd_req_set_attributes(or, &ios->obj);
300/* EXOFS_DBGMSG("set_attributes sync=%d\n", sync);*/ 622 EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
623 _LLU(ios->obj.id), ios->out_attr_len, dev);
301 } 624 }
302 625
303 if (ios->out_attr) 626 if (ios->out_attr)
@@ -308,54 +631,93 @@ int exofs_sbi_write(struct exofs_io_state *ios)
308 osd_req_add_get_attr_list(or, ios->in_attr, 631 osd_req_add_get_attr_list(or, ios->in_attr,
309 ios->in_attr_len); 632 ios->in_attr_len);
310 } 633 }
311 ret = exofs_io_execute(ios);
312 634
313out: 635out:
314 return ret; 636 return ret;
315} 637}
316 638
317int exofs_sbi_read(struct exofs_io_state *ios) 639int exofs_sbi_write(struct exofs_io_state *ios)
318{ 640{
319 int i, ret; 641 int i;
642 int ret;
320 643
321 for (i = 0; i < 1; i++) { 644 ret = _prepare_for_striping(ios);
322 struct osd_request *or; 645 if (unlikely(ret))
323 unsigned first_dev = (unsigned)ios->obj.id; 646 return ret;
324 647
325 first_dev %= ios->sbi->s_numdevs; 648 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
326 or = osd_start_request(ios->sbi->s_ods[first_dev], GFP_KERNEL); 649 ret = _sbi_write_mirror(ios, i);
327 if (unlikely(!or)) { 650 if (unlikely(ret))
328 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 651 return ret;
329 ret = -ENOMEM; 652 }
330 goto out;
331 }
332 ios->per_dev[i].or = or;
333 ios->numdevs++;
334 653
335 if (ios->bio) { 654 ret = exofs_io_execute(ios);
336 osd_req_read(or, &ios->obj, ios->offset, ios->bio, 655 return ret;
337 ios->length); 656}
338/* EXOFS_DBGMSG("read sync=%d\n", sync);*/
339 } else if (ios->kern_buff) {
340 osd_req_read_kern(or, &ios->obj, ios->offset,
341 ios->kern_buff, ios->length);
342/* EXOFS_DBGMSG("read_kern sync=%d\n", sync);*/
343 } else {
344 osd_req_get_attributes(or, &ios->obj);
345/* EXOFS_DBGMSG("get_attributes sync=%d\n", sync);*/
346 }
347 657
348 if (ios->out_attr) 658static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp)
349 osd_req_add_set_attr_list(or, ios->out_attr, 659{
350 ios->out_attr_len); 660 struct osd_request *or;
661 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
662 unsigned first_dev = (unsigned)ios->obj.id;
351 663
352 if (ios->in_attr) 664 if (ios->pages && !per_dev->length)
353 osd_req_add_get_attr_list(or, ios->in_attr, 665 return 0; /* Just an empty slot */
354 ios->in_attr_len); 666
667 first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
668 or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL);
669 if (unlikely(!or)) {
670 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
671 return -ENOMEM;
355 } 672 }
356 ret = exofs_io_execute(ios); 673 per_dev->or = or;
674
675 if (ios->pages) {
676 osd_req_read(or, &ios->obj, per_dev->offset,
677 per_dev->bio, per_dev->length);
678 EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
679 " dev=%d\n", _LLU(ios->obj.id),
680 _LLU(per_dev->offset), _LLU(per_dev->length),
681 first_dev);
682 } else if (ios->kern_buff) {
683 int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset,
684 ios->kern_buff, ios->length);
685 EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
686 "length=0x%llx dev=%d ret=>%d\n",
687 _LLU(ios->obj.id), _LLU(per_dev->offset),
688 _LLU(ios->length), first_dev, ret);
689 if (unlikely(ret))
690 return ret;
691 } else {
692 osd_req_get_attributes(or, &ios->obj);
693 EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
694 _LLU(ios->obj.id), ios->in_attr_len, first_dev);
695 }
696 if (ios->out_attr)
697 osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);
357 698
358out: 699 if (ios->in_attr)
700 osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len);
701
702 return 0;
703}
704
705int exofs_sbi_read(struct exofs_io_state *ios)
706{
707 int i;
708 int ret;
709
710 ret = _prepare_for_striping(ios);
711 if (unlikely(ret))
712 return ret;
713
714 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
715 ret = _sbi_read_mirror(ios, i);
716 if (unlikely(ret))
717 return ret;
718 }
719
720 ret = exofs_io_execute(ios);
359 return ret; 721 return ret;
360} 722}
361 723
@@ -380,42 +742,82 @@ int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
380 return -EIO; 742 return -EIO;
381} 743}
382 744
745static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp,
746 struct osd_attr *attr)
747{
748 int last_comp = cur_comp + ios->layout->mirrors_p1;
749
750 for (; cur_comp < last_comp; ++cur_comp) {
751 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
752 struct osd_request *or;
753
754 or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL);
755 if (unlikely(!or)) {
756 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
757 return -ENOMEM;
758 }
759 per_dev->or = or;
760
761 osd_req_set_attributes(or, &ios->obj);
762 osd_req_add_set_attr_list(or, attr, 1);
763 }
764
765 return 0;
766}
767
383int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) 768int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
384{ 769{
385 struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info; 770 struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info;
386 struct exofs_io_state *ios; 771 struct exofs_io_state *ios;
387 struct osd_attr attr; 772 struct exofs_trunc_attr {
388 __be64 newsize; 773 struct osd_attr attr;
774 __be64 newsize;
775 } *size_attrs;
776 struct _striping_info si;
389 int i, ret; 777 int i, ret;
390 778
391 if (exofs_get_io_state(sbi, &ios)) 779 ret = exofs_get_io_state(&sbi->layout, &ios);
392 return -ENOMEM; 780 if (unlikely(ret))
781 return ret;
782
783 size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs),
784 GFP_KERNEL);
785 if (unlikely(!size_attrs)) {
786 ret = -ENOMEM;
787 goto out;
788 }
393 789
394 ios->obj.id = exofs_oi_objno(oi); 790 ios->obj.id = exofs_oi_objno(oi);
395 ios->cred = oi->i_cred; 791 ios->cred = oi->i_cred;
396 792
397 newsize = cpu_to_be64(size); 793 ios->numdevs = ios->layout->s_numdevs;
398 attr = g_attr_logical_length; 794 _calc_stripe_info(ios, size, &si);
399 attr.val_ptr = &newsize;
400 795
401 for (i = 0; i < sbi->s_numdevs; i++) { 796 for (i = 0; i < ios->layout->group_width; ++i) {
402 struct osd_request *or; 797 struct exofs_trunc_attr *size_attr = &size_attrs[i];
798 u64 obj_size;
403 799
404 or = osd_start_request(sbi->s_ods[i], GFP_KERNEL); 800 if (i < si.dev)
405 if (unlikely(!or)) { 801 obj_size = si.obj_offset +
406 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 802 ios->layout->stripe_unit - si.unit_off;
407 ret = -ENOMEM; 803 else if (i == si.dev)
408 goto out; 804 obj_size = si.obj_offset;
409 } 805 else /* i > si.dev */
410 ios->per_dev[i].or = or; 806 obj_size = si.obj_offset - si.unit_off;
411 ios->numdevs++;
412 807
413 osd_req_set_attributes(or, &ios->obj); 808 size_attr->newsize = cpu_to_be64(obj_size);
414 osd_req_add_set_attr_list(or, &attr, 1); 809 size_attr->attr = g_attr_logical_length;
810 size_attr->attr.val_ptr = &size_attr->newsize;
811
812 ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
813 &size_attr->attr);
814 if (unlikely(ret))
815 goto out;
415 } 816 }
416 ret = exofs_io_execute(ios); 817 ret = exofs_io_execute(ios);
417 818
418out: 819out:
820 kfree(size_attrs);
419 exofs_put_io_state(ios); 821 exofs_put_io_state(ios);
420 return ret; 822 return ret;
421} 823}
diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h
index 423033addd1f..c52e9888b8ab 100644
--- a/fs/exofs/pnfs.h
+++ b/fs/exofs/pnfs.h
@@ -15,13 +15,7 @@
15#ifndef __EXOFS_PNFS_H__ 15#ifndef __EXOFS_PNFS_H__
16#define __EXOFS_PNFS_H__ 16#define __EXOFS_PNFS_H__
17 17
18#if defined(CONFIG_PNFS) 18#if ! defined(__PNFS_OSD_XDR_H__)
19
20
21/* FIXME: move this file to: linux/exportfs/pnfs_osd_xdr.h */
22#include "../nfs/objlayout/pnfs_osd_xdr.h"
23
24#else /* defined(CONFIG_PNFS) */
25 19
26enum pnfs_iomode { 20enum pnfs_iomode {
27 IOMODE_READ = 1, 21 IOMODE_READ = 1,
@@ -46,6 +40,6 @@ struct pnfs_osd_data_map {
46 u32 odm_raid_algorithm; 40 u32 odm_raid_algorithm;
47}; 41};
48 42
49#endif /* else defined(CONFIG_PNFS) */ 43#endif /* ! defined(__PNFS_OSD_XDR_H__) */
50 44
51#endif /* __EXOFS_PNFS_H__ */ 45#endif /* __EXOFS_PNFS_H__ */
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index a1d1e77b12eb..03149b9a5178 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -37,6 +37,7 @@
37#include <linux/vfs.h> 37#include <linux/vfs.h>
38#include <linux/random.h> 38#include <linux/random.h>
39#include <linux/exportfs.h> 39#include <linux/exportfs.h>
40#include <linux/slab.h>
40 41
41#include "exofs.h" 42#include "exofs.h"
42 43
@@ -210,7 +211,7 @@ int exofs_sync_fs(struct super_block *sb, int wait)
210 sbi = sb->s_fs_info; 211 sbi = sb->s_fs_info;
211 fscb = &sbi->s_fscb; 212 fscb = &sbi->s_fscb;
212 213
213 ret = exofs_get_io_state(sbi, &ios); 214 ret = exofs_get_io_state(&sbi->layout, &ios);
214 if (ret) 215 if (ret)
215 goto out; 216 goto out;
216 217
@@ -264,12 +265,12 @@ static void _exofs_print_device(const char *msg, const char *dev_path,
264 265
265void exofs_free_sbi(struct exofs_sb_info *sbi) 266void exofs_free_sbi(struct exofs_sb_info *sbi)
266{ 267{
267 while (sbi->s_numdevs) { 268 while (sbi->layout.s_numdevs) {
268 int i = --sbi->s_numdevs; 269 int i = --sbi->layout.s_numdevs;
269 struct osd_dev *od = sbi->s_ods[i]; 270 struct osd_dev *od = sbi->layout.s_ods[i];
270 271
271 if (od) { 272 if (od) {
272 sbi->s_ods[i] = NULL; 273 sbi->layout.s_ods[i] = NULL;
273 osduld_put_device(od); 274 osduld_put_device(od);
274 } 275 }
275 } 276 }
@@ -298,8 +299,10 @@ static void exofs_put_super(struct super_block *sb)
298 msecs_to_jiffies(100)); 299 msecs_to_jiffies(100));
299 } 300 }
300 301
301 _exofs_print_device("Unmounting", NULL, sbi->s_ods[0], sbi->s_pid); 302 _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0],
303 sbi->layout.s_pid);
302 304
305 bdi_destroy(&sbi->bdi);
303 exofs_free_sbi(sbi); 306 exofs_free_sbi(sbi);
304 sb->s_fs_info = NULL; 307 sb->s_fs_info = NULL;
305} 308}
@@ -307,6 +310,8 @@ static void exofs_put_super(struct super_block *sb)
307static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, 310static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
308 struct exofs_device_table *dt) 311 struct exofs_device_table *dt)
309{ 312{
313 u64 stripe_length;
314
310 sbi->data_map.odm_num_comps = 315 sbi->data_map.odm_num_comps =
311 le32_to_cpu(dt->dt_data_map.cb_num_comps); 316 le32_to_cpu(dt->dt_data_map.cb_num_comps);
312 sbi->data_map.odm_stripe_unit = 317 sbi->data_map.odm_stripe_unit =
@@ -320,14 +325,63 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
320 sbi->data_map.odm_raid_algorithm = 325 sbi->data_map.odm_raid_algorithm =
321 le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); 326 le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
322 327
323/* FIXME: Hard coded mirror only for now. if not so do not mount */ 328/* FIXME: Only raid0 for now. if not so, do not mount */
324 if ((sbi->data_map.odm_num_comps != numdevs) || 329 if (sbi->data_map.odm_num_comps != numdevs) {
325 (sbi->data_map.odm_stripe_unit != EXOFS_BLKSIZE) || 330 EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n",
326 (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) || 331 sbi->data_map.odm_num_comps, numdevs);
327 (sbi->data_map.odm_mirror_cnt != (numdevs - 1)))
328 return -EINVAL; 332 return -EINVAL;
329 else 333 }
330 return 0; 334 if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) {
335 EXOFS_ERR("Only RAID_0 for now\n");
336 return -EINVAL;
337 }
338 if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) {
339 EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n",
340 numdevs, sbi->data_map.odm_mirror_cnt);
341 return -EINVAL;
342 }
343
344 if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) {
345 EXOFS_ERR("Stripe Unit(0x%llx)"
346 " must be Multples of PAGE_SIZE(0x%lx)\n",
347 _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE);
348 return -EINVAL;
349 }
350
351 sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit;
352 sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1;
353
354 if (sbi->data_map.odm_group_width) {
355 sbi->layout.group_width = sbi->data_map.odm_group_width;
356 sbi->layout.group_depth = sbi->data_map.odm_group_depth;
357 if (!sbi->layout.group_depth) {
358 EXOFS_ERR("group_depth == 0 && group_width != 0\n");
359 return -EINVAL;
360 }
361 sbi->layout.group_count = sbi->data_map.odm_num_comps /
362 sbi->layout.mirrors_p1 /
363 sbi->data_map.odm_group_width;
364 } else {
365 if (sbi->data_map.odm_group_depth) {
366 printk(KERN_NOTICE "Warning: group_depth ignored "
367 "group_width == 0 && group_depth == %d\n",
368 sbi->data_map.odm_group_depth);
369 sbi->data_map.odm_group_depth = 0;
370 }
371 sbi->layout.group_width = sbi->data_map.odm_num_comps /
372 sbi->layout.mirrors_p1;
373 sbi->layout.group_depth = -1;
374 sbi->layout.group_count = 1;
375 }
376
377 stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit;
378 if (stripe_length >= (1ULL << 32)) {
379 EXOFS_ERR("Total Stripe length(0x%llx)"
380 " >= 32bit is not supported\n", _LLU(stripe_length));
381 return -EINVAL;
382 }
383
384 return 0;
331} 385}
332 386
333/* @odi is valid only as long as @fscb_dev is valid */ 387/* @odi is valid only as long as @fscb_dev is valid */
@@ -361,7 +415,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
361{ 415{
362 struct exofs_sb_info *sbi = *psbi; 416 struct exofs_sb_info *sbi = *psbi;
363 struct osd_dev *fscb_od; 417 struct osd_dev *fscb_od;
364 struct osd_obj_id obj = {.partition = sbi->s_pid, 418 struct osd_obj_id obj = {.partition = sbi->layout.s_pid,
365 .id = EXOFS_DEVTABLE_ID}; 419 .id = EXOFS_DEVTABLE_ID};
366 struct exofs_device_table *dt; 420 struct exofs_device_table *dt;
367 unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + 421 unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
@@ -376,9 +430,9 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
376 return -ENOMEM; 430 return -ENOMEM;
377 } 431 }
378 432
379 fscb_od = sbi->s_ods[0]; 433 fscb_od = sbi->layout.s_ods[0];
380 sbi->s_ods[0] = NULL; 434 sbi->layout.s_ods[0] = NULL;
381 sbi->s_numdevs = 0; 435 sbi->layout.s_numdevs = 0;
382 ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes); 436 ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes);
383 if (unlikely(ret)) { 437 if (unlikely(ret)) {
384 EXOFS_ERR("ERROR: reading device table\n"); 438 EXOFS_ERR("ERROR: reading device table\n");
@@ -397,14 +451,15 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
397 goto out; 451 goto out;
398 452
399 if (likely(numdevs > 1)) { 453 if (likely(numdevs > 1)) {
400 unsigned size = numdevs * sizeof(sbi->s_ods[0]); 454 unsigned size = numdevs * sizeof(sbi->layout.s_ods[0]);
401 455
402 sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL); 456 sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL);
403 if (unlikely(!sbi)) { 457 if (unlikely(!sbi)) {
404 ret = -ENOMEM; 458 ret = -ENOMEM;
405 goto out; 459 goto out;
406 } 460 }
407 memset(&sbi->s_ods[1], 0, size - sizeof(sbi->s_ods[0])); 461 memset(&sbi->layout.s_ods[1], 0,
462 size - sizeof(sbi->layout.s_ods[0]));
408 *psbi = sbi; 463 *psbi = sbi;
409 } 464 }
410 465
@@ -427,8 +482,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
427 * line. We always keep them in device-table order. 482 * line. We always keep them in device-table order.
428 */ 483 */
429 if (fscb_od && osduld_device_same(fscb_od, &odi)) { 484 if (fscb_od && osduld_device_same(fscb_od, &odi)) {
430 sbi->s_ods[i] = fscb_od; 485 sbi->layout.s_ods[i] = fscb_od;
431 ++sbi->s_numdevs; 486 ++sbi->layout.s_numdevs;
432 fscb_od = NULL; 487 fscb_od = NULL;
433 continue; 488 continue;
434 } 489 }
@@ -441,8 +496,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
441 goto out; 496 goto out;
442 } 497 }
443 498
444 sbi->s_ods[i] = od; 499 sbi->layout.s_ods[i] = od;
445 ++sbi->s_numdevs; 500 ++sbi->layout.s_numdevs;
446 501
447 /* Read the fscb of the other devices to make sure the FS 502 /* Read the fscb of the other devices to make sure the FS
448 * partition is there. 503 * partition is there.
@@ -492,6 +547,10 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
492 if (!sbi) 547 if (!sbi)
493 return -ENOMEM; 548 return -ENOMEM;
494 549
550 ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
551 if (ret)
552 goto free_bdi;
553
495 /* use mount options to fill superblock */ 554 /* use mount options to fill superblock */
496 od = osduld_path_lookup(opts->dev_name); 555 od = osduld_path_lookup(opts->dev_name);
497 if (IS_ERR(od)) { 556 if (IS_ERR(od)) {
@@ -499,9 +558,15 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
499 goto free_sbi; 558 goto free_sbi;
500 } 559 }
501 560
502 sbi->s_ods[0] = od; 561 /* Default layout in case we do not have a device-table */
503 sbi->s_numdevs = 1; 562 sbi->layout.stripe_unit = PAGE_SIZE;
504 sbi->s_pid = opts->pid; 563 sbi->layout.mirrors_p1 = 1;
564 sbi->layout.group_width = 1;
565 sbi->layout.group_depth = -1;
566 sbi->layout.group_count = 1;
567 sbi->layout.s_ods[0] = od;
568 sbi->layout.s_numdevs = 1;
569 sbi->layout.s_pid = opts->pid;
505 sbi->s_timeout = opts->timeout; 570 sbi->s_timeout = opts->timeout;
506 571
507 /* fill in some other data by hand */ 572 /* fill in some other data by hand */
@@ -514,7 +579,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
514 sb->s_bdev = NULL; 579 sb->s_bdev = NULL;
515 sb->s_dev = 0; 580 sb->s_dev = 0;
516 581
517 obj.partition = sbi->s_pid; 582 obj.partition = sbi->layout.s_pid;
518 obj.id = EXOFS_SUPER_ID; 583 obj.id = EXOFS_SUPER_ID;
519 exofs_make_credential(sbi->s_cred, &obj); 584 exofs_make_credential(sbi->s_cred, &obj);
520 585
@@ -552,6 +617,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
552 } 617 }
553 618
554 /* set up operation vectors */ 619 /* set up operation vectors */
620 sb->s_bdi = &sbi->bdi;
555 sb->s_fs_info = sbi; 621 sb->s_fs_info = sbi;
556 sb->s_op = &exofs_sops; 622 sb->s_op = &exofs_sops;
557 sb->s_export_op = &exofs_export_ops; 623 sb->s_export_op = &exofs_export_ops;
@@ -578,13 +644,15 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
578 goto free_sbi; 644 goto free_sbi;
579 } 645 }
580 646
581 _exofs_print_device("Mounting", opts->dev_name, sbi->s_ods[0], 647 _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0],
582 sbi->s_pid); 648 sbi->layout.s_pid);
583 return 0; 649 return 0;
584 650
585free_sbi: 651free_sbi:
652 bdi_destroy(&sbi->bdi);
653free_bdi:
586 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", 654 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
587 opts->dev_name, sbi->s_pid, ret); 655 opts->dev_name, sbi->layout.s_pid, ret);
588 exofs_free_sbi(sbi); 656 exofs_free_sbi(sbi);
589 return ret; 657 return ret;
590} 658}
@@ -627,7 +695,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
627 uint8_t cred_a[OSD_CAP_LEN]; 695 uint8_t cred_a[OSD_CAP_LEN];
628 int ret; 696 int ret;
629 697
630 ret = exofs_get_io_state(sbi, &ios); 698 ret = exofs_get_io_state(&sbi->layout, &ios);
631 if (ret) { 699 if (ret) {
632 EXOFS_DBGMSG("exofs_get_io_state failed.\n"); 700 EXOFS_DBGMSG("exofs_get_io_state failed.\n");
633 return ret; 701 return ret;
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 7f8d2e5a7ea6..3cf038c055d7 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -13,6 +13,7 @@
13 13
14#include "ext2.h" 14#include "ext2.h"
15#include <linux/quotaops.h> 15#include <linux/quotaops.h>
16#include <linux/slab.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
17#include <linux/buffer_head.h> 18#include <linux/buffer_head.h>
18#include <linux/capability.h> 19#include <linux/capability.h>
@@ -570,7 +571,7 @@ do_more:
570error_return: 571error_return:
571 brelse(bitmap_bh); 572 brelse(bitmap_bh);
572 release_blocks(sb, freed); 573 release_blocks(sb, freed);
573 vfs_dq_free_block(inode, freed); 574 dquot_free_block(inode, freed);
574} 575}
575 576
576/** 577/**
@@ -1236,6 +1237,7 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
1236 unsigned short windowsz = 0; 1237 unsigned short windowsz = 0;
1237 unsigned long ngroups; 1238 unsigned long ngroups;
1238 unsigned long num = *count; 1239 unsigned long num = *count;
1240 int ret;
1239 1241
1240 *errp = -ENOSPC; 1242 *errp = -ENOSPC;
1241 sb = inode->i_sb; 1243 sb = inode->i_sb;
@@ -1247,8 +1249,9 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
1247 /* 1249 /*
1248 * Check quota for allocation of this block. 1250 * Check quota for allocation of this block.
1249 */ 1251 */
1250 if (vfs_dq_alloc_block(inode, num)) { 1252 ret = dquot_alloc_block(inode, num);
1251 *errp = -EDQUOT; 1253 if (ret) {
1254 *errp = ret;
1252 return 0; 1255 return 0;
1253 } 1256 }
1254 1257
@@ -1409,7 +1412,7 @@ allocated:
1409 1412
1410 *errp = 0; 1413 *errp = 0;
1411 brelse(bitmap_bh); 1414 brelse(bitmap_bh);
1412 vfs_dq_free_block(inode, *count-num); 1415 dquot_free_block(inode, *count-num);
1413 *count = num; 1416 *count = num;
1414 return ret_block; 1417 return ret_block;
1415 1418
@@ -1420,7 +1423,7 @@ out:
1420 * Undo the block allocation 1423 * Undo the block allocation
1421 */ 1424 */
1422 if (!performed_allocation) 1425 if (!performed_allocation)
1423 vfs_dq_free_block(inode, *count); 1426 dquot_free_block(inode, *count);
1424 brelse(bitmap_bh); 1427 brelse(bitmap_bh);
1425 return 0; 1428 return 0;
1426} 1429}
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 061914add3cf..0b038e47ad2f 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -118,7 +118,7 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned);
118 118
119/* inode.c */ 119/* inode.c */
120extern struct inode *ext2_iget (struct super_block *, unsigned long); 120extern struct inode *ext2_iget (struct super_block *, unsigned long);
121extern int ext2_write_inode (struct inode *, int); 121extern int ext2_write_inode (struct inode *, struct writeback_control *);
122extern void ext2_delete_inode (struct inode *); 122extern void ext2_delete_inode (struct inode *);
123extern int ext2_sync_inode (struct inode *); 123extern int ext2_sync_inode (struct inode *);
124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); 124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 586e3589d4c2..5d198d0697fb 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -20,6 +20,7 @@
20 20
21#include <linux/time.h> 21#include <linux/time.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/quotaops.h>
23#include "ext2.h" 24#include "ext2.h"
24#include "xattr.h" 25#include "xattr.h"
25#include "acl.h" 26#include "acl.h"
@@ -70,7 +71,7 @@ const struct file_operations ext2_file_operations = {
70 .compat_ioctl = ext2_compat_ioctl, 71 .compat_ioctl = ext2_compat_ioctl,
71#endif 72#endif
72 .mmap = generic_file_mmap, 73 .mmap = generic_file_mmap,
73 .open = generic_file_open, 74 .open = dquot_file_open,
74 .release = ext2_release_file, 75 .release = ext2_release_file,
75 .fsync = ext2_fsync, 76 .fsync = ext2_fsync,
76 .splice_read = generic_file_splice_read, 77 .splice_read = generic_file_splice_read,
@@ -87,7 +88,7 @@ const struct file_operations ext2_xip_file_operations = {
87 .compat_ioctl = ext2_compat_ioctl, 88 .compat_ioctl = ext2_compat_ioctl,
88#endif 89#endif
89 .mmap = xip_file_mmap, 90 .mmap = xip_file_mmap,
90 .open = generic_file_open, 91 .open = dquot_file_open,
91 .release = ext2_release_file, 92 .release = ext2_release_file,
92 .fsync = ext2_fsync, 93 .fsync = ext2_fsync,
93}; 94};
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 15387c9c17d8..ad7d572ee8dc 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -121,8 +121,8 @@ void ext2_free_inode (struct inode * inode)
121 if (!is_bad_inode(inode)) { 121 if (!is_bad_inode(inode)) {
122 /* Quota is already initialized in iput() */ 122 /* Quota is already initialized in iput() */
123 ext2_xattr_delete_inode(inode); 123 ext2_xattr_delete_inode(inode);
124 vfs_dq_free_inode(inode); 124 dquot_free_inode(inode);
125 vfs_dq_drop(inode); 125 dquot_drop(inode);
126 } 126 }
127 127
128 es = EXT2_SB(sb)->s_es; 128 es = EXT2_SB(sb)->s_es;
@@ -586,10 +586,10 @@ got:
586 goto fail_drop; 586 goto fail_drop;
587 } 587 }
588 588
589 if (vfs_dq_alloc_inode(inode)) { 589 dquot_initialize(inode);
590 err = -EDQUOT; 590 err = dquot_alloc_inode(inode);
591 if (err)
591 goto fail_drop; 592 goto fail_drop;
592 }
593 593
594 err = ext2_init_acl(inode, dir); 594 err = ext2_init_acl(inode, dir);
595 if (err) 595 if (err)
@@ -605,10 +605,10 @@ got:
605 return inode; 605 return inode;
606 606
607fail_free_drop: 607fail_free_drop:
608 vfs_dq_free_inode(inode); 608 dquot_free_inode(inode);
609 609
610fail_drop: 610fail_drop:
611 vfs_dq_drop(inode); 611 dquot_drop(inode);
612 inode->i_flags |= S_NOQUOTA; 612 inode->i_flags |= S_NOQUOTA;
613 inode->i_nlink = 0; 613 inode->i_nlink = 0;
614 unlock_new_inode(inode); 614 unlock_new_inode(inode);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 71b032c65a02..fc13cc119aad 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -41,6 +41,8 @@ MODULE_AUTHOR("Remy Card and others");
41MODULE_DESCRIPTION("Second Extended Filesystem"); 41MODULE_DESCRIPTION("Second Extended Filesystem");
42MODULE_LICENSE("GPL"); 42MODULE_LICENSE("GPL");
43 43
44static int __ext2_write_inode(struct inode *inode, int do_sync);
45
44/* 46/*
45 * Test whether an inode is a fast symlink. 47 * Test whether an inode is a fast symlink.
46 */ 48 */
@@ -58,13 +60,15 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode)
58 */ 60 */
59void ext2_delete_inode (struct inode * inode) 61void ext2_delete_inode (struct inode * inode)
60{ 62{
63 if (!is_bad_inode(inode))
64 dquot_initialize(inode);
61 truncate_inode_pages(&inode->i_data, 0); 65 truncate_inode_pages(&inode->i_data, 0);
62 66
63 if (is_bad_inode(inode)) 67 if (is_bad_inode(inode))
64 goto no_delete; 68 goto no_delete;
65 EXT2_I(inode)->i_dtime = get_seconds(); 69 EXT2_I(inode)->i_dtime = get_seconds();
66 mark_inode_dirty(inode); 70 mark_inode_dirty(inode);
67 ext2_write_inode(inode, inode_needs_sync(inode)); 71 __ext2_write_inode(inode, inode_needs_sync(inode));
68 72
69 inode->i_size = 0; 73 inode->i_size = 0;
70 if (inode->i_blocks) 74 if (inode->i_blocks)
@@ -1335,7 +1339,7 @@ bad_inode:
1335 return ERR_PTR(ret); 1339 return ERR_PTR(ret);
1336} 1340}
1337 1341
1338int ext2_write_inode(struct inode *inode, int do_sync) 1342static int __ext2_write_inode(struct inode *inode, int do_sync)
1339{ 1343{
1340 struct ext2_inode_info *ei = EXT2_I(inode); 1344 struct ext2_inode_info *ei = EXT2_I(inode);
1341 struct super_block *sb = inode->i_sb; 1345 struct super_block *sb = inode->i_sb;
@@ -1440,6 +1444,11 @@ int ext2_write_inode(struct inode *inode, int do_sync)
1440 return err; 1444 return err;
1441} 1445}
1442 1446
1447int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
1448{
1449 return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1450}
1451
1443int ext2_sync_inode(struct inode *inode) 1452int ext2_sync_inode(struct inode *inode)
1444{ 1453{
1445 struct writeback_control wbc = { 1454 struct writeback_control wbc = {
@@ -1457,9 +1466,12 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1457 error = inode_change_ok(inode, iattr); 1466 error = inode_change_ok(inode, iattr);
1458 if (error) 1467 if (error)
1459 return error; 1468 return error;
1469
1470 if (iattr->ia_valid & ATTR_SIZE)
1471 dquot_initialize(inode);
1460 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || 1472 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
1461 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { 1473 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
1462 error = vfs_dq_transfer(inode, iattr) ? -EDQUOT : 0; 1474 error = dquot_transfer(inode, iattr);
1463 if (error) 1475 if (error)
1464 return error; 1476 return error;
1465 } 1477 }
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index dd7175ce5606..71efb0e9a3f2 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <linux/pagemap.h> 33#include <linux/pagemap.h>
34#include <linux/quotaops.h>
34#include "ext2.h" 35#include "ext2.h"
35#include "xattr.h" 36#include "xattr.h"
36#include "acl.h" 37#include "acl.h"
@@ -99,24 +100,27 @@ struct dentry *ext2_get_parent(struct dentry *child)
99 */ 100 */
100static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd) 101static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd)
101{ 102{
102 struct inode * inode = ext2_new_inode (dir, mode); 103 struct inode *inode;
103 int err = PTR_ERR(inode); 104
104 if (!IS_ERR(inode)) { 105 dquot_initialize(dir);
105 inode->i_op = &ext2_file_inode_operations; 106
106 if (ext2_use_xip(inode->i_sb)) { 107 inode = ext2_new_inode(dir, mode);
107 inode->i_mapping->a_ops = &ext2_aops_xip; 108 if (IS_ERR(inode))
108 inode->i_fop = &ext2_xip_file_operations; 109 return PTR_ERR(inode);
109 } else if (test_opt(inode->i_sb, NOBH)) { 110
110 inode->i_mapping->a_ops = &ext2_nobh_aops; 111 inode->i_op = &ext2_file_inode_operations;
111 inode->i_fop = &ext2_file_operations; 112 if (ext2_use_xip(inode->i_sb)) {
112 } else { 113 inode->i_mapping->a_ops = &ext2_aops_xip;
113 inode->i_mapping->a_ops = &ext2_aops; 114 inode->i_fop = &ext2_xip_file_operations;
114 inode->i_fop = &ext2_file_operations; 115 } else if (test_opt(inode->i_sb, NOBH)) {
115 } 116 inode->i_mapping->a_ops = &ext2_nobh_aops;
116 mark_inode_dirty(inode); 117 inode->i_fop = &ext2_file_operations;
117 err = ext2_add_nondir(dentry, inode); 118 } else {
119 inode->i_mapping->a_ops = &ext2_aops;
120 inode->i_fop = &ext2_file_operations;
118 } 121 }
119 return err; 122 mark_inode_dirty(inode);
123 return ext2_add_nondir(dentry, inode);
120} 124}
121 125
122static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev) 126static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev)
@@ -127,6 +131,8 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_
127 if (!new_valid_dev(rdev)) 131 if (!new_valid_dev(rdev))
128 return -EINVAL; 132 return -EINVAL;
129 133
134 dquot_initialize(dir);
135
130 inode = ext2_new_inode (dir, mode); 136 inode = ext2_new_inode (dir, mode);
131 err = PTR_ERR(inode); 137 err = PTR_ERR(inode);
132 if (!IS_ERR(inode)) { 138 if (!IS_ERR(inode)) {
@@ -151,6 +157,8 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
151 if (l > sb->s_blocksize) 157 if (l > sb->s_blocksize)
152 goto out; 158 goto out;
153 159
160 dquot_initialize(dir);
161
154 inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO); 162 inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO);
155 err = PTR_ERR(inode); 163 err = PTR_ERR(inode);
156 if (IS_ERR(inode)) 164 if (IS_ERR(inode))
@@ -194,6 +202,8 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
194 if (inode->i_nlink >= EXT2_LINK_MAX) 202 if (inode->i_nlink >= EXT2_LINK_MAX)
195 return -EMLINK; 203 return -EMLINK;
196 204
205 dquot_initialize(dir);
206
197 inode->i_ctime = CURRENT_TIME_SEC; 207 inode->i_ctime = CURRENT_TIME_SEC;
198 inode_inc_link_count(inode); 208 inode_inc_link_count(inode);
199 atomic_inc(&inode->i_count); 209 atomic_inc(&inode->i_count);
@@ -216,6 +226,8 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
216 if (dir->i_nlink >= EXT2_LINK_MAX) 226 if (dir->i_nlink >= EXT2_LINK_MAX)
217 goto out; 227 goto out;
218 228
229 dquot_initialize(dir);
230
219 inode_inc_link_count(dir); 231 inode_inc_link_count(dir);
220 232
221 inode = ext2_new_inode (dir, S_IFDIR | mode); 233 inode = ext2_new_inode (dir, S_IFDIR | mode);
@@ -262,6 +274,8 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry)
262 struct page * page; 274 struct page * page;
263 int err = -ENOENT; 275 int err = -ENOENT;
264 276
277 dquot_initialize(dir);
278
265 de = ext2_find_entry (dir, &dentry->d_name, &page); 279 de = ext2_find_entry (dir, &dentry->d_name, &page);
266 if (!de) 280 if (!de)
267 goto out; 281 goto out;
@@ -304,6 +318,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
304 struct ext2_dir_entry_2 * old_de; 318 struct ext2_dir_entry_2 * old_de;
305 int err = -ENOENT; 319 int err = -ENOENT;
306 320
321 dquot_initialize(old_dir);
322 dquot_initialize(new_dir);
323
307 old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page); 324 old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page);
308 if (!old_de) 325 if (!old_de)
309 goto out; 326 goto out;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index f9cb54a585ce..42e4a303b675 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -194,6 +194,8 @@ static void destroy_inodecache(void)
194static void ext2_clear_inode(struct inode *inode) 194static void ext2_clear_inode(struct inode *inode)
195{ 195{
196 struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info; 196 struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info;
197
198 dquot_drop(inode);
197 ext2_discard_reservation(inode); 199 ext2_discard_reservation(inode);
198 EXT2_I(inode)->i_block_alloc_info = NULL; 200 EXT2_I(inode)->i_block_alloc_info = NULL;
199 if (unlikely(rsv)) 201 if (unlikely(rsv))
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index 4e2426e22bbe..565cf817bbf1 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -32,6 +32,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
32 .readlink = generic_readlink, 32 .readlink = generic_readlink,
33 .follow_link = page_follow_link_light, 33 .follow_link = page_follow_link_light,
34 .put_link = page_put_link, 34 .put_link = page_put_link,
35 .setattr = ext2_setattr,
35#ifdef CONFIG_EXT2_FS_XATTR 36#ifdef CONFIG_EXT2_FS_XATTR
36 .setxattr = generic_setxattr, 37 .setxattr = generic_setxattr,
37 .getxattr = generic_getxattr, 38 .getxattr = generic_getxattr,
@@ -43,6 +44,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
43const struct inode_operations ext2_fast_symlink_inode_operations = { 44const struct inode_operations ext2_fast_symlink_inode_operations = {
44 .readlink = generic_readlink, 45 .readlink = generic_readlink,
45 .follow_link = ext2_follow_link, 46 .follow_link = ext2_follow_link,
47 .setattr = ext2_setattr,
46#ifdef CONFIG_EXT2_FS_XATTR 48#ifdef CONFIG_EXT2_FS_XATTR
47 .setxattr = generic_setxattr, 49 .setxattr = generic_setxattr,
48 .getxattr = generic_getxattr, 50 .getxattr = generic_getxattr,
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 904f00642f84..e44dc92609be 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -644,8 +644,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
644 the inode. */ 644 the inode. */
645 ea_bdebug(new_bh, "reusing block"); 645 ea_bdebug(new_bh, "reusing block");
646 646
647 error = -EDQUOT; 647 error = dquot_alloc_block(inode, 1);
648 if (vfs_dq_alloc_block(inode, 1)) { 648 if (error) {
649 unlock_buffer(new_bh); 649 unlock_buffer(new_bh);
650 goto cleanup; 650 goto cleanup;
651 } 651 }
@@ -702,7 +702,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
702 * as if nothing happened and cleanup the unused block */ 702 * as if nothing happened and cleanup the unused block */
703 if (error && error != -ENOSPC) { 703 if (error && error != -ENOSPC) {
704 if (new_bh && new_bh != old_bh) 704 if (new_bh && new_bh != old_bh)
705 vfs_dq_free_block(inode, 1); 705 dquot_free_block(inode, 1);
706 goto cleanup; 706 goto cleanup;
707 } 707 }
708 } else 708 } else
@@ -734,7 +734,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
734 le32_add_cpu(&HDR(old_bh)->h_refcount, -1); 734 le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
735 if (ce) 735 if (ce)
736 mb_cache_entry_release(ce); 736 mb_cache_entry_release(ce);
737 vfs_dq_free_block(inode, 1); 737 dquot_free_block(inode, 1);
738 mark_buffer_dirty(old_bh); 738 mark_buffer_dirty(old_bh);
739 ea_bdebug(old_bh, "refcount now=%d", 739 ea_bdebug(old_bh, "refcount now=%d",
740 le32_to_cpu(HDR(old_bh)->h_refcount)); 740 le32_to_cpu(HDR(old_bh)->h_refcount));
@@ -797,7 +797,7 @@ ext2_xattr_delete_inode(struct inode *inode)
797 mark_buffer_dirty(bh); 797 mark_buffer_dirty(bh);
798 if (IS_SYNC(inode)) 798 if (IS_SYNC(inode))
799 sync_dirty_buffer(bh); 799 sync_dirty_buffer(bh);
800 vfs_dq_free_block(inode, 1); 800 dquot_free_block(inode, 1);
801 } 801 }
802 EXT2_I(inode)->i_file_acl = 0; 802 EXT2_I(inode)->i_file_acl = 0;
803 803
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index c8155845ac05..b118c6383c6d 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -4,6 +4,7 @@
4 */ 4 */
5 5
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/slab.h>
7#include <linux/string.h> 8#include <linux/string.h>
8#include <linux/fs.h> 9#include <linux/fs.h>
9#include <linux/ext2_fs.h> 10#include <linux/ext2_fs.h>
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 27967f92e820..a177122a1b25 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -14,6 +14,7 @@
14#include <linux/time.h> 14#include <linux/time.h>
15#include <linux/capability.h> 15#include <linux/capability.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/slab.h>
17#include <linux/jbd.h> 18#include <linux/jbd.h>
18#include <linux/ext3_fs.h> 19#include <linux/ext3_fs.h>
19#include <linux/ext3_jbd.h> 20#include <linux/ext3_jbd.h>
@@ -676,7 +677,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode,
676 } 677 }
677 ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); 678 ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
678 if (dquot_freed_blocks) 679 if (dquot_freed_blocks)
679 vfs_dq_free_block(inode, dquot_freed_blocks); 680 dquot_free_block(inode, dquot_freed_blocks);
680 return; 681 return;
681} 682}
682 683
@@ -1502,8 +1503,9 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
1502 /* 1503 /*
1503 * Check quota for allocation of this block. 1504 * Check quota for allocation of this block.
1504 */ 1505 */
1505 if (vfs_dq_alloc_block(inode, num)) { 1506 err = dquot_alloc_block(inode, num);
1506 *errp = -EDQUOT; 1507 if (err) {
1508 *errp = err;
1507 return 0; 1509 return 0;
1508 } 1510 }
1509 1511
@@ -1713,7 +1715,7 @@ allocated:
1713 1715
1714 *errp = 0; 1716 *errp = 0;
1715 brelse(bitmap_bh); 1717 brelse(bitmap_bh);
1716 vfs_dq_free_block(inode, *count-num); 1718 dquot_free_block(inode, *count-num);
1717 *count = num; 1719 *count = num;
1718 return ret_block; 1720 return ret_block;
1719 1721
@@ -1728,7 +1730,7 @@ out:
1728 * Undo the block allocation 1730 * Undo the block allocation
1729 */ 1731 */
1730 if (!performed_allocation) 1732 if (!performed_allocation)
1731 vfs_dq_free_block(inode, *count); 1733 dquot_free_block(inode, *count);
1732 brelse(bitmap_bh); 1734 brelse(bitmap_bh);
1733 return 0; 1735 return 0;
1734} 1736}
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 388bbdfa0b4e..f55df0e61cbd 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -21,6 +21,7 @@
21#include <linux/time.h> 21#include <linux/time.h>
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/jbd.h> 23#include <linux/jbd.h>
24#include <linux/quotaops.h>
24#include <linux/ext3_fs.h> 25#include <linux/ext3_fs.h>
25#include <linux/ext3_jbd.h> 26#include <linux/ext3_jbd.h>
26#include "xattr.h" 27#include "xattr.h"
@@ -33,9 +34,9 @@
33 */ 34 */
34static int ext3_release_file (struct inode * inode, struct file * filp) 35static int ext3_release_file (struct inode * inode, struct file * filp)
35{ 36{
36 if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) { 37 if (ext3_test_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE)) {
37 filemap_flush(inode->i_mapping); 38 filemap_flush(inode->i_mapping);
38 EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE; 39 ext3_clear_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
39 } 40 }
40 /* if we are the last writer on the inode, drop the block reservation */ 41 /* if we are the last writer on the inode, drop the block reservation */
41 if ((filp->f_mode & FMODE_WRITE) && 42 if ((filp->f_mode & FMODE_WRITE) &&
@@ -62,7 +63,7 @@ const struct file_operations ext3_file_operations = {
62 .compat_ioctl = ext3_compat_ioctl, 63 .compat_ioctl = ext3_compat_ioctl,
63#endif 64#endif
64 .mmap = generic_file_mmap, 65 .mmap = generic_file_mmap,
65 .open = generic_file_open, 66 .open = dquot_file_open,
66 .release = ext3_release_file, 67 .release = ext3_release_file,
67 .fsync = ext3_sync_file, 68 .fsync = ext3_sync_file,
68 .splice_read = generic_file_splice_read, 69 .splice_read = generic_file_splice_read,
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index b39991285136..0d0e97ed3ff6 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -123,10 +123,10 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
123 * Note: we must free any quota before locking the superblock, 123 * Note: we must free any quota before locking the superblock,
124 * as writing the quota to disk may need the lock as well. 124 * as writing the quota to disk may need the lock as well.
125 */ 125 */
126 vfs_dq_init(inode); 126 dquot_initialize(inode);
127 ext3_xattr_delete_inode(handle, inode); 127 ext3_xattr_delete_inode(handle, inode);
128 vfs_dq_free_inode(inode); 128 dquot_free_inode(inode);
129 vfs_dq_drop(inode); 129 dquot_drop(inode);
130 130
131 is_directory = S_ISDIR(inode->i_mode); 131 is_directory = S_ISDIR(inode->i_mode);
132 132
@@ -582,16 +582,18 @@ got:
582 inode->i_generation = sbi->s_next_generation++; 582 inode->i_generation = sbi->s_next_generation++;
583 spin_unlock(&sbi->s_next_gen_lock); 583 spin_unlock(&sbi->s_next_gen_lock);
584 584
585 ei->i_state = EXT3_STATE_NEW; 585 ei->i_state_flags = 0;
586 ext3_set_inode_state(inode, EXT3_STATE_NEW);
587
586 ei->i_extra_isize = 588 ei->i_extra_isize =
587 (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ? 589 (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ?
588 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; 590 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
589 591
590 ret = inode; 592 ret = inode;
591 if (vfs_dq_alloc_inode(inode)) { 593 dquot_initialize(inode);
592 err = -EDQUOT; 594 err = dquot_alloc_inode(inode);
595 if (err)
593 goto fail_drop; 596 goto fail_drop;
594 }
595 597
596 err = ext3_init_acl(handle, inode, dir); 598 err = ext3_init_acl(handle, inode, dir);
597 if (err) 599 if (err)
@@ -619,10 +621,10 @@ really_out:
619 return ret; 621 return ret;
620 622
621fail_free_drop: 623fail_free_drop:
622 vfs_dq_free_inode(inode); 624 dquot_free_inode(inode);
623 625
624fail_drop: 626fail_drop:
625 vfs_dq_drop(inode); 627 dquot_drop(inode);
626 inode->i_flags |= S_NOQUOTA; 628 inode->i_flags |= S_NOQUOTA;
627 inode->i_nlink = 0; 629 inode->i_nlink = 0;
628 unlock_new_inode(inode); 630 unlock_new_inode(inode);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 455e6e6e5cb9..ea33bdf0a300 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -196,6 +196,9 @@ void ext3_delete_inode (struct inode * inode)
196{ 196{
197 handle_t *handle; 197 handle_t *handle;
198 198
199 if (!is_bad_inode(inode))
200 dquot_initialize(inode);
201
199 truncate_inode_pages(&inode->i_data, 0); 202 truncate_inode_pages(&inode->i_data, 0);
200 203
201 if (is_bad_inode(inode)) 204 if (is_bad_inode(inode))
@@ -1378,7 +1381,7 @@ static int ext3_journalled_write_end(struct file *file,
1378 */ 1381 */
1379 if (pos + len > inode->i_size && ext3_can_truncate(inode)) 1382 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1380 ext3_orphan_add(handle, inode); 1383 ext3_orphan_add(handle, inode);
1381 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; 1384 ext3_set_inode_state(inode, EXT3_STATE_JDATA);
1382 if (inode->i_size > EXT3_I(inode)->i_disksize) { 1385 if (inode->i_size > EXT3_I(inode)->i_disksize) {
1383 EXT3_I(inode)->i_disksize = inode->i_size; 1386 EXT3_I(inode)->i_disksize = inode->i_size;
1384 ret2 = ext3_mark_inode_dirty(handle, inode); 1387 ret2 = ext3_mark_inode_dirty(handle, inode);
@@ -1417,7 +1420,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1417 journal_t *journal; 1420 journal_t *journal;
1418 int err; 1421 int err;
1419 1422
1420 if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) { 1423 if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) {
1421 /* 1424 /*
1422 * This is a REALLY heavyweight approach, but the use of 1425 * This is a REALLY heavyweight approach, but the use of
1423 * bmap on dirty files is expected to be extremely rare: 1426 * bmap on dirty files is expected to be extremely rare:
@@ -1436,7 +1439,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1436 * everything they get. 1439 * everything they get.
1437 */ 1440 */
1438 1441
1439 EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA; 1442 ext3_clear_inode_state(inode, EXT3_STATE_JDATA);
1440 journal = EXT3_JOURNAL(inode); 1443 journal = EXT3_JOURNAL(inode);
1441 journal_lock_updates(journal); 1444 journal_lock_updates(journal);
1442 err = journal_flush(journal); 1445 err = journal_flush(journal);
@@ -1528,6 +1531,7 @@ static int ext3_ordered_writepage(struct page *page,
1528 int err; 1531 int err;
1529 1532
1530 J_ASSERT(PageLocked(page)); 1533 J_ASSERT(PageLocked(page));
1534 WARN_ON_ONCE(IS_RDONLY(inode));
1531 1535
1532 /* 1536 /*
1533 * We give up here if we're reentered, because it might be for a 1537 * We give up here if we're reentered, because it might be for a
@@ -1600,6 +1604,9 @@ static int ext3_writeback_writepage(struct page *page,
1600 int ret = 0; 1604 int ret = 0;
1601 int err; 1605 int err;
1602 1606
1607 J_ASSERT(PageLocked(page));
1608 WARN_ON_ONCE(IS_RDONLY(inode));
1609
1603 if (ext3_journal_current_handle()) 1610 if (ext3_journal_current_handle())
1604 goto out_fail; 1611 goto out_fail;
1605 1612
@@ -1642,6 +1649,9 @@ static int ext3_journalled_writepage(struct page *page,
1642 int ret = 0; 1649 int ret = 0;
1643 int err; 1650 int err;
1644 1651
1652 J_ASSERT(PageLocked(page));
1653 WARN_ON_ONCE(IS_RDONLY(inode));
1654
1645 if (ext3_journal_current_handle()) 1655 if (ext3_journal_current_handle())
1646 goto no_write; 1656 goto no_write;
1647 1657
@@ -1670,7 +1680,7 @@ static int ext3_journalled_writepage(struct page *page,
1670 PAGE_CACHE_SIZE, NULL, write_end_fn); 1680 PAGE_CACHE_SIZE, NULL, write_end_fn);
1671 if (ret == 0) 1681 if (ret == 0)
1672 ret = err; 1682 ret = err;
1673 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; 1683 ext3_set_inode_state(inode, EXT3_STATE_JDATA);
1674 unlock_page(page); 1684 unlock_page(page);
1675 } else { 1685 } else {
1676 /* 1686 /*
@@ -1785,8 +1795,9 @@ retry:
1785 handle = ext3_journal_start(inode, 2); 1795 handle = ext3_journal_start(inode, 2);
1786 if (IS_ERR(handle)) { 1796 if (IS_ERR(handle)) {
1787 /* This is really bad luck. We've written the data 1797 /* This is really bad luck. We've written the data
1788 * but cannot extend i_size. Bail out and pretend 1798 * but cannot extend i_size. Truncate allocated blocks
1789 * the write failed... */ 1799 * and pretend the write failed... */
1800 ext3_truncate(inode);
1790 ret = PTR_ERR(handle); 1801 ret = PTR_ERR(handle);
1791 goto out; 1802 goto out;
1792 } 1803 }
@@ -2402,7 +2413,7 @@ void ext3_truncate(struct inode *inode)
2402 goto out_notrans; 2413 goto out_notrans;
2403 2414
2404 if (inode->i_size == 0 && ext3_should_writeback_data(inode)) 2415 if (inode->i_size == 0 && ext3_should_writeback_data(inode))
2405 ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE; 2416 ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
2406 2417
2407 /* 2418 /*
2408 * We have to lock the EOF page here, because lock_page() nests 2419 * We have to lock the EOF page here, because lock_page() nests
@@ -2721,7 +2732,7 @@ int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
2721{ 2732{
2722 /* We have all inode data except xattrs in memory here. */ 2733 /* We have all inode data except xattrs in memory here. */
2723 return __ext3_get_inode_loc(inode, iloc, 2734 return __ext3_get_inode_loc(inode, iloc,
2724 !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)); 2735 !ext3_test_inode_state(inode, EXT3_STATE_XATTR));
2725} 2736}
2726 2737
2727void ext3_set_inode_flags(struct inode *inode) 2738void ext3_set_inode_flags(struct inode *inode)
@@ -2800,7 +2811,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2800 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); 2811 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
2801 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; 2812 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2802 2813
2803 ei->i_state = 0; 2814 ei->i_state_flags = 0;
2804 ei->i_dir_start_lookup = 0; 2815 ei->i_dir_start_lookup = 0;
2805 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 2816 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2806 /* We now have enough fields to check if the inode was active or not. 2817 /* We now have enough fields to check if the inode was active or not.
@@ -2893,7 +2904,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2893 EXT3_GOOD_OLD_INODE_SIZE + 2904 EXT3_GOOD_OLD_INODE_SIZE +
2894 ei->i_extra_isize; 2905 ei->i_extra_isize;
2895 if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC)) 2906 if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
2896 ei->i_state |= EXT3_STATE_XATTR; 2907 ext3_set_inode_state(inode, EXT3_STATE_XATTR);
2897 } 2908 }
2898 } else 2909 } else
2899 ei->i_extra_isize = 0; 2910 ei->i_extra_isize = 0;
@@ -2955,7 +2966,7 @@ again:
2955 2966
2956 /* For fields not not tracking in the in-memory inode, 2967 /* For fields not not tracking in the in-memory inode,
2957 * initialise them to zero for new inodes. */ 2968 * initialise them to zero for new inodes. */
2958 if (ei->i_state & EXT3_STATE_NEW) 2969 if (ext3_test_inode_state(inode, EXT3_STATE_NEW))
2959 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); 2970 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
2960 2971
2961 ext3_get_inode_flags(ei); 2972 ext3_get_inode_flags(ei);
@@ -3052,7 +3063,7 @@ again:
3052 rc = ext3_journal_dirty_metadata(handle, bh); 3063 rc = ext3_journal_dirty_metadata(handle, bh);
3053 if (!err) 3064 if (!err)
3054 err = rc; 3065 err = rc;
3055 ei->i_state &= ~EXT3_STATE_NEW; 3066 ext3_clear_inode_state(inode, EXT3_STATE_NEW);
3056 3067
3057 atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); 3068 atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
3058out_brelse: 3069out_brelse:
@@ -3096,7 +3107,7 @@ out_brelse:
3096 * `stuff()' is running, and the new i_size will be lost. Plus the inode 3107 * `stuff()' is running, and the new i_size will be lost. Plus the inode
3097 * will no longer be on the superblock's dirty inode list. 3108 * will no longer be on the superblock's dirty inode list.
3098 */ 3109 */
3099int ext3_write_inode(struct inode *inode, int wait) 3110int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
3100{ 3111{
3101 if (current->flags & PF_MEMALLOC) 3112 if (current->flags & PF_MEMALLOC)
3102 return 0; 3113 return 0;
@@ -3107,7 +3118,7 @@ int ext3_write_inode(struct inode *inode, int wait)
3107 return -EIO; 3118 return -EIO;
3108 } 3119 }
3109 3120
3110 if (!wait) 3121 if (wbc->sync_mode != WB_SYNC_ALL)
3111 return 0; 3122 return 0;
3112 3123
3113 return ext3_force_commit(inode->i_sb); 3124 return ext3_force_commit(inode->i_sb);
@@ -3140,6 +3151,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
3140 if (error) 3151 if (error)
3141 return error; 3152 return error;
3142 3153
3154 if (ia_valid & ATTR_SIZE)
3155 dquot_initialize(inode);
3143 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 3156 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
3144 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 3157 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
3145 handle_t *handle; 3158 handle_t *handle;
@@ -3152,7 +3165,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
3152 error = PTR_ERR(handle); 3165 error = PTR_ERR(handle);
3153 goto err_out; 3166 goto err_out;
3154 } 3167 }
3155 error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; 3168 error = dquot_transfer(inode, attr);
3156 if (error) { 3169 if (error) {
3157 ext3_journal_stop(handle); 3170 ext3_journal_stop(handle);
3158 return error; 3171 return error;
@@ -3237,7 +3250,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode)
3237 ret = 2 * (bpp + indirects) + 2; 3250 ret = 2 * (bpp + indirects) + 2;
3238 3251
3239#ifdef CONFIG_QUOTA 3252#ifdef CONFIG_QUOTA
3240 /* We know that structure was already allocated during vfs_dq_init so 3253 /* We know that structure was already allocated during dquot_initialize so
3241 * we will be updating only the data blocks + inodes */ 3254 * we will be updating only the data blocks + inodes */
3242 ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); 3255 ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
3243#endif 3256#endif
@@ -3328,7 +3341,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
3328 * i_size has been changed by generic_commit_write() and we thus need 3341 * i_size has been changed by generic_commit_write() and we thus need
3329 * to include the updated inode in the current transaction. 3342 * to include the updated inode in the current transaction.
3330 * 3343 *
3331 * Also, vfs_dq_alloc_space() will always dirty the inode when blocks 3344 * Also, dquot_alloc_space() will always dirty the inode when blocks
3332 * are allocated to the file. 3345 * are allocated to the file.
3333 * 3346 *
3334 * If the inode is marked synchronous, we don't honour that here - doing 3347 * If the inode is marked synchronous, we don't honour that here - doing
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 7b0e44f7d66f..ee184084ca42 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1696,6 +1696,8 @@ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
1696 struct inode * inode; 1696 struct inode * inode;
1697 int err, retries = 0; 1697 int err, retries = 0;
1698 1698
1699 dquot_initialize(dir);
1700
1699retry: 1701retry:
1700 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 1702 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1701 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1703 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1730,6 +1732,8 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry,
1730 if (!new_valid_dev(rdev)) 1732 if (!new_valid_dev(rdev))
1731 return -EINVAL; 1733 return -EINVAL;
1732 1734
1735 dquot_initialize(dir);
1736
1733retry: 1737retry:
1734 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 1738 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1735 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1739 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1766,6 +1770,8 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
1766 if (dir->i_nlink >= EXT3_LINK_MAX) 1770 if (dir->i_nlink >= EXT3_LINK_MAX)
1767 return -EMLINK; 1771 return -EMLINK;
1768 1772
1773 dquot_initialize(dir);
1774
1769retry: 1775retry:
1770 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 1776 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1771 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1777 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -2060,7 +2066,9 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
2060 2066
2061 /* Initialize quotas before so that eventual writes go in 2067 /* Initialize quotas before so that eventual writes go in
2062 * separate transaction */ 2068 * separate transaction */
2063 vfs_dq_init(dentry->d_inode); 2069 dquot_initialize(dir);
2070 dquot_initialize(dentry->d_inode);
2071
2064 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); 2072 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
2065 if (IS_ERR(handle)) 2073 if (IS_ERR(handle))
2066 return PTR_ERR(handle); 2074 return PTR_ERR(handle);
@@ -2119,7 +2127,9 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
2119 2127
2120 /* Initialize quotas before so that eventual writes go 2128 /* Initialize quotas before so that eventual writes go
2121 * in separate transaction */ 2129 * in separate transaction */
2122 vfs_dq_init(dentry->d_inode); 2130 dquot_initialize(dir);
2131 dquot_initialize(dentry->d_inode);
2132
2123 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); 2133 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
2124 if (IS_ERR(handle)) 2134 if (IS_ERR(handle))
2125 return PTR_ERR(handle); 2135 return PTR_ERR(handle);
@@ -2174,6 +2184,8 @@ static int ext3_symlink (struct inode * dir,
2174 if (l > dir->i_sb->s_blocksize) 2184 if (l > dir->i_sb->s_blocksize)
2175 return -ENAMETOOLONG; 2185 return -ENAMETOOLONG;
2176 2186
2187 dquot_initialize(dir);
2188
2177retry: 2189retry:
2178 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 2190 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2179 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + 2191 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
@@ -2228,6 +2240,9 @@ static int ext3_link (struct dentry * old_dentry,
2228 2240
2229 if (inode->i_nlink >= EXT3_LINK_MAX) 2241 if (inode->i_nlink >= EXT3_LINK_MAX)
2230 return -EMLINK; 2242 return -EMLINK;
2243
2244 dquot_initialize(dir);
2245
2231 /* 2246 /*
2232 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing 2247 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
2233 * otherwise has the potential to corrupt the orphan inode list. 2248 * otherwise has the potential to corrupt the orphan inode list.
@@ -2278,12 +2293,15 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2278 struct ext3_dir_entry_2 * old_de, * new_de; 2293 struct ext3_dir_entry_2 * old_de, * new_de;
2279 int retval, flush_file = 0; 2294 int retval, flush_file = 0;
2280 2295
2296 dquot_initialize(old_dir);
2297 dquot_initialize(new_dir);
2298
2281 old_bh = new_bh = dir_bh = NULL; 2299 old_bh = new_bh = dir_bh = NULL;
2282 2300
2283 /* Initialize quotas before so that eventual writes go 2301 /* Initialize quotas before so that eventual writes go
2284 * in separate transaction */ 2302 * in separate transaction */
2285 if (new_dentry->d_inode) 2303 if (new_dentry->d_inode)
2286 vfs_dq_init(new_dentry->d_inode); 2304 dquot_initialize(new_dentry->d_inode);
2287 handle = ext3_journal_start(old_dir, 2 * 2305 handle = ext3_journal_start(old_dir, 2 *
2288 EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) + 2306 EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
2289 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); 2307 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index afa2b569da10..1bee604cc6cd 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -164,7 +164,7 @@ void ext3_msg(struct super_block *sb, const char *prefix,
164 * write out the superblock safely. 164 * write out the superblock safely.
165 * 165 *
166 * We'll just use the journal_abort() error code to record an error in 166 * We'll just use the journal_abort() error code to record an error in
167 * the journal instead. On recovery, the journal will compain about 167 * the journal instead. On recovery, the journal will complain about
168 * that error until we've noted it down and cleared it. 168 * that error until we've noted it down and cleared it.
169 */ 169 */
170 170
@@ -181,7 +181,7 @@ static void ext3_handle_error(struct super_block *sb)
181 if (!test_opt (sb, ERRORS_CONT)) { 181 if (!test_opt (sb, ERRORS_CONT)) {
182 journal_t *journal = EXT3_SB(sb)->s_journal; 182 journal_t *journal = EXT3_SB(sb)->s_journal;
183 183
184 EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; 184 set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
185 if (journal) 185 if (journal)
186 journal_abort(journal, -EIO); 186 journal_abort(journal, -EIO);
187 } 187 }
@@ -296,7 +296,7 @@ void ext3_abort (struct super_block * sb, const char * function,
296 "error: remounting filesystem read-only"); 296 "error: remounting filesystem read-only");
297 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; 297 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
298 sb->s_flags |= MS_RDONLY; 298 sb->s_flags |= MS_RDONLY;
299 EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; 299 set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
300 if (EXT3_SB(sb)->s_journal) 300 if (EXT3_SB(sb)->s_journal)
301 journal_abort(EXT3_SB(sb)->s_journal, -EIO); 301 journal_abort(EXT3_SB(sb)->s_journal, -EIO);
302} 302}
@@ -528,6 +528,8 @@ static void destroy_inodecache(void)
528static void ext3_clear_inode(struct inode *inode) 528static void ext3_clear_inode(struct inode *inode)
529{ 529{
530 struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info; 530 struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
531
532 dquot_drop(inode);
531 ext3_discard_reservation(inode); 533 ext3_discard_reservation(inode);
532 EXT3_I(inode)->i_block_alloc_info = NULL; 534 EXT3_I(inode)->i_block_alloc_info = NULL;
533 if (unlikely(rsv)) 535 if (unlikely(rsv))
@@ -562,10 +564,10 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl
562 if (sbi->s_qf_names[GRPQUOTA]) 564 if (sbi->s_qf_names[GRPQUOTA])
563 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); 565 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
564 566
565 if (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) 567 if (test_opt(sb, USRQUOTA))
566 seq_puts(seq, ",usrquota"); 568 seq_puts(seq, ",usrquota");
567 569
568 if (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) 570 if (test_opt(sb, GRPQUOTA))
569 seq_puts(seq, ",grpquota"); 571 seq_puts(seq, ",grpquota");
570#endif 572#endif
571} 573}
@@ -656,8 +658,7 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
656 if (test_opt(sb, NOBH)) 658 if (test_opt(sb, NOBH))
657 seq_puts(seq, ",nobh"); 659 seq_puts(seq, ",nobh");
658 660
659 seq_printf(seq, ",data=%s", data_mode_string(sbi->s_mount_opt & 661 seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS)));
660 EXT3_MOUNT_DATA_FLAGS));
661 if (test_opt(sb, DATA_ERR_ABORT)) 662 if (test_opt(sb, DATA_ERR_ABORT))
662 seq_puts(seq, ",data_err=abort"); 663 seq_puts(seq, ",data_err=abort");
663 664
@@ -751,13 +752,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
751 const char *data, size_t len, loff_t off); 752 const char *data, size_t len, loff_t off);
752 753
753static const struct dquot_operations ext3_quota_operations = { 754static const struct dquot_operations ext3_quota_operations = {
754 .initialize = dquot_initialize,
755 .drop = dquot_drop,
756 .alloc_space = dquot_alloc_space,
757 .alloc_inode = dquot_alloc_inode,
758 .free_space = dquot_free_space,
759 .free_inode = dquot_free_inode,
760 .transfer = dquot_transfer,
761 .write_dquot = ext3_write_dquot, 755 .write_dquot = ext3_write_dquot,
762 .acquire_dquot = ext3_acquire_dquot, 756 .acquire_dquot = ext3_acquire_dquot,
763 .release_dquot = ext3_release_dquot, 757 .release_dquot = ext3_release_dquot,
@@ -896,6 +890,63 @@ static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb)
896 return sb_block; 890 return sb_block;
897} 891}
898 892
893#ifdef CONFIG_QUOTA
894static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
895{
896 struct ext3_sb_info *sbi = EXT3_SB(sb);
897 char *qname;
898
899 if (sb_any_quota_loaded(sb) &&
900 !sbi->s_qf_names[qtype]) {
901 ext3_msg(sb, KERN_ERR,
902 "Cannot change journaled "
903 "quota options when quota turned on");
904 return 0;
905 }
906 qname = match_strdup(args);
907 if (!qname) {
908 ext3_msg(sb, KERN_ERR,
909 "Not enough memory for storing quotafile name");
910 return 0;
911 }
912 if (sbi->s_qf_names[qtype] &&
913 strcmp(sbi->s_qf_names[qtype], qname)) {
914 ext3_msg(sb, KERN_ERR,
915 "%s quota file already specified", QTYPE2NAME(qtype));
916 kfree(qname);
917 return 0;
918 }
919 sbi->s_qf_names[qtype] = qname;
920 if (strchr(sbi->s_qf_names[qtype], '/')) {
921 ext3_msg(sb, KERN_ERR,
922 "quotafile must be on filesystem root");
923 kfree(sbi->s_qf_names[qtype]);
924 sbi->s_qf_names[qtype] = NULL;
925 return 0;
926 }
927 set_opt(sbi->s_mount_opt, QUOTA);
928 return 1;
929}
930
931static int clear_qf_name(struct super_block *sb, int qtype) {
932
933 struct ext3_sb_info *sbi = EXT3_SB(sb);
934
935 if (sb_any_quota_loaded(sb) &&
936 sbi->s_qf_names[qtype]) {
937 ext3_msg(sb, KERN_ERR, "Cannot change journaled quota options"
938 " when quota turned on");
939 return 0;
940 }
941 /*
942 * The space will be released later when all options are confirmed
943 * to be correct
944 */
945 sbi->s_qf_names[qtype] = NULL;
946 return 1;
947}
948#endif
949
899static int parse_options (char *options, struct super_block *sb, 950static int parse_options (char *options, struct super_block *sb,
900 unsigned int *inum, unsigned long *journal_devnum, 951 unsigned int *inum, unsigned long *journal_devnum,
901 ext3_fsblk_t *n_blocks_count, int is_remount) 952 ext3_fsblk_t *n_blocks_count, int is_remount)
@@ -906,8 +957,7 @@ static int parse_options (char *options, struct super_block *sb,
906 int data_opt = 0; 957 int data_opt = 0;
907 int option; 958 int option;
908#ifdef CONFIG_QUOTA 959#ifdef CONFIG_QUOTA
909 int qtype, qfmt; 960 int qfmt;
910 char *qname;
911#endif 961#endif
912 962
913 if (!options) 963 if (!options)
@@ -1065,20 +1115,19 @@ static int parse_options (char *options, struct super_block *sb,
1065 data_opt = EXT3_MOUNT_WRITEBACK_DATA; 1115 data_opt = EXT3_MOUNT_WRITEBACK_DATA;
1066 datacheck: 1116 datacheck:
1067 if (is_remount) { 1117 if (is_remount) {
1068 if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS) 1118 if (test_opt(sb, DATA_FLAGS) == data_opt)
1069 == data_opt)
1070 break; 1119 break;
1071 ext3_msg(sb, KERN_ERR, 1120 ext3_msg(sb, KERN_ERR,
1072 "error: cannot change " 1121 "error: cannot change "
1073 "data mode on remount. The filesystem " 1122 "data mode on remount. The filesystem "
1074 "is mounted in data=%s mode and you " 1123 "is mounted in data=%s mode and you "
1075 "try to remount it in data=%s mode.", 1124 "try to remount it in data=%s mode.",
1076 data_mode_string(sbi->s_mount_opt & 1125 data_mode_string(test_opt(sb,
1077 EXT3_MOUNT_DATA_FLAGS), 1126 DATA_FLAGS)),
1078 data_mode_string(data_opt)); 1127 data_mode_string(data_opt));
1079 return 0; 1128 return 0;
1080 } else { 1129 } else {
1081 sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS; 1130 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
1082 sbi->s_mount_opt |= data_opt; 1131 sbi->s_mount_opt |= data_opt;
1083 } 1132 }
1084 break; 1133 break;
@@ -1090,62 +1139,20 @@ static int parse_options (char *options, struct super_block *sb,
1090 break; 1139 break;
1091#ifdef CONFIG_QUOTA 1140#ifdef CONFIG_QUOTA
1092 case Opt_usrjquota: 1141 case Opt_usrjquota:
1093 qtype = USRQUOTA; 1142 if (!set_qf_name(sb, USRQUOTA, &args[0]))
1094 goto set_qf_name;
1095 case Opt_grpjquota:
1096 qtype = GRPQUOTA;
1097set_qf_name:
1098 if (sb_any_quota_loaded(sb) &&
1099 !sbi->s_qf_names[qtype]) {
1100 ext3_msg(sb, KERN_ERR,
1101 "error: cannot change journaled "
1102 "quota options when quota turned on.");
1103 return 0;
1104 }
1105 qname = match_strdup(&args[0]);
1106 if (!qname) {
1107 ext3_msg(sb, KERN_ERR,
1108 "error: not enough memory for "
1109 "storing quotafile name.");
1110 return 0; 1143 return 0;
1111 } 1144 break;
1112 if (sbi->s_qf_names[qtype] && 1145 case Opt_grpjquota:
1113 strcmp(sbi->s_qf_names[qtype], qname)) { 1146 if (!set_qf_name(sb, GRPQUOTA, &args[0]))
1114 ext3_msg(sb, KERN_ERR,
1115 "error: %s quota file already "
1116 "specified.", QTYPE2NAME(qtype));
1117 kfree(qname);
1118 return 0;
1119 }
1120 sbi->s_qf_names[qtype] = qname;
1121 if (strchr(sbi->s_qf_names[qtype], '/')) {
1122 ext3_msg(sb, KERN_ERR,
1123 "error: quotafile must be on "
1124 "filesystem root.");
1125 kfree(sbi->s_qf_names[qtype]);
1126 sbi->s_qf_names[qtype] = NULL;
1127 return 0; 1147 return 0;
1128 }
1129 set_opt(sbi->s_mount_opt, QUOTA);
1130 break; 1148 break;
1131 case Opt_offusrjquota: 1149 case Opt_offusrjquota:
1132 qtype = USRQUOTA; 1150 if (!clear_qf_name(sb, USRQUOTA))
1133 goto clear_qf_name; 1151 return 0;
1152 break;
1134 case Opt_offgrpjquota: 1153 case Opt_offgrpjquota:
1135 qtype = GRPQUOTA; 1154 if (!clear_qf_name(sb, GRPQUOTA))
1136clear_qf_name:
1137 if (sb_any_quota_loaded(sb) &&
1138 sbi->s_qf_names[qtype]) {
1139 ext3_msg(sb, KERN_ERR, "error: cannot change "
1140 "journaled quota options when "
1141 "quota turned on.");
1142 return 0; 1155 return 0;
1143 }
1144 /*
1145 * The space will be released later when all options
1146 * are confirmed to be correct
1147 */
1148 sbi->s_qf_names[qtype] = NULL;
1149 break; 1156 break;
1150 case Opt_jqfmt_vfsold: 1157 case Opt_jqfmt_vfsold:
1151 qfmt = QFMT_VFS_OLD; 1158 qfmt = QFMT_VFS_OLD;
@@ -1244,18 +1251,12 @@ set_qf_format:
1244 } 1251 }
1245#ifdef CONFIG_QUOTA 1252#ifdef CONFIG_QUOTA
1246 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { 1253 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1247 if ((sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) && 1254 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
1248 sbi->s_qf_names[USRQUOTA])
1249 clear_opt(sbi->s_mount_opt, USRQUOTA); 1255 clear_opt(sbi->s_mount_opt, USRQUOTA);
1250 1256 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
1251 if ((sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) &&
1252 sbi->s_qf_names[GRPQUOTA])
1253 clear_opt(sbi->s_mount_opt, GRPQUOTA); 1257 clear_opt(sbi->s_mount_opt, GRPQUOTA);
1254 1258
1255 if ((sbi->s_qf_names[USRQUOTA] && 1259 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
1256 (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)) ||
1257 (sbi->s_qf_names[GRPQUOTA] &&
1258 (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA))) {
1259 ext3_msg(sb, KERN_ERR, "error: old and new quota " 1260 ext3_msg(sb, KERN_ERR, "error: old and new quota "
1260 "format mixing."); 1261 "format mixing.");
1261 return 0; 1262 return 0;
@@ -1478,7 +1479,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1478 } 1479 }
1479 1480
1480 list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); 1481 list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
1481 vfs_dq_init(inode); 1482 dquot_initialize(inode);
1482 if (inode->i_nlink) { 1483 if (inode->i_nlink) {
1483 printk(KERN_DEBUG 1484 printk(KERN_DEBUG
1484 "%s: truncating inode %lu to %Ld bytes\n", 1485 "%s: truncating inode %lu to %Ld bytes\n",
@@ -1671,11 +1672,11 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1671 set_opt(sbi->s_mount_opt, POSIX_ACL); 1672 set_opt(sbi->s_mount_opt, POSIX_ACL);
1672#endif 1673#endif
1673 if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA) 1674 if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA)
1674 sbi->s_mount_opt |= EXT3_MOUNT_JOURNAL_DATA; 1675 set_opt(sbi->s_mount_opt, JOURNAL_DATA);
1675 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED) 1676 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED)
1676 sbi->s_mount_opt |= EXT3_MOUNT_ORDERED_DATA; 1677 set_opt(sbi->s_mount_opt, ORDERED_DATA);
1677 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK) 1678 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK)
1678 sbi->s_mount_opt |= EXT3_MOUNT_WRITEBACK_DATA; 1679 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
1679 1680
1680 if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC) 1681 if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC)
1681 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 1682 set_opt(sbi->s_mount_opt, ERRORS_PANIC);
@@ -1694,7 +1695,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1694 goto failed_mount; 1695 goto failed_mount;
1695 1696
1696 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 1697 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
1697 ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 1698 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
1698 1699
1699 if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV && 1700 if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
1700 (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || 1701 (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
@@ -2561,11 +2562,11 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2561 goto restore_opts; 2562 goto restore_opts;
2562 } 2563 }
2563 2564
2564 if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) 2565 if (test_opt(sb, ABORT))
2565 ext3_abort(sb, __func__, "Abort forced by user"); 2566 ext3_abort(sb, __func__, "Abort forced by user");
2566 2567
2567 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 2568 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
2568 ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 2569 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
2569 2570
2570 es = sbi->s_es; 2571 es = sbi->s_es;
2571 2572
@@ -2573,7 +2574,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2573 2574
2574 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || 2575 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
2575 n_blocks_count > le32_to_cpu(es->s_blocks_count)) { 2576 n_blocks_count > le32_to_cpu(es->s_blocks_count)) {
2576 if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) { 2577 if (test_opt(sb, ABORT)) {
2577 err = -EROFS; 2578 err = -EROFS;
2578 goto restore_opts; 2579 goto restore_opts;
2579 } 2580 }
@@ -2734,7 +2735,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
2734 * Process 1 Process 2 2735 * Process 1 Process 2
2735 * ext3_create() quota_sync() 2736 * ext3_create() quota_sync()
2736 * journal_start() write_dquot() 2737 * journal_start() write_dquot()
2737 * vfs_dq_init() down(dqio_mutex) 2738 * dquot_initialize() down(dqio_mutex)
2738 * down(dqio_mutex) journal_start() 2739 * down(dqio_mutex) journal_start()
2739 * 2740 *
2740 */ 2741 */
@@ -2942,9 +2943,7 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
2942 sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb); 2943 sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb);
2943 int err = 0; 2944 int err = 0;
2944 int offset = off & (sb->s_blocksize - 1); 2945 int offset = off & (sb->s_blocksize - 1);
2945 int tocopy;
2946 int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL; 2946 int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL;
2947 size_t towrite = len;
2948 struct buffer_head *bh; 2947 struct buffer_head *bh;
2949 handle_t *handle = journal_current_handle(); 2948 handle_t *handle = journal_current_handle();
2950 2949
@@ -2955,53 +2954,54 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
2955 (unsigned long long)off, (unsigned long long)len); 2954 (unsigned long long)off, (unsigned long long)len);
2956 return -EIO; 2955 return -EIO;
2957 } 2956 }
2957
2958 /*
2959 * Since we account only one data block in transaction credits,
2960 * then it is impossible to cross a block boundary.
2961 */
2962 if (sb->s_blocksize - offset < len) {
2963 ext3_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
2964 " cancelled because not block aligned",
2965 (unsigned long long)off, (unsigned long long)len);
2966 return -EIO;
2967 }
2958 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); 2968 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
2959 while (towrite > 0) { 2969 bh = ext3_bread(handle, inode, blk, 1, &err);
2960 tocopy = sb->s_blocksize - offset < towrite ? 2970 if (!bh)
2961 sb->s_blocksize - offset : towrite; 2971 goto out;
2962 bh = ext3_bread(handle, inode, blk, 1, &err); 2972 if (journal_quota) {
2963 if (!bh) 2973 err = ext3_journal_get_write_access(handle, bh);
2974 if (err) {
2975 brelse(bh);
2964 goto out; 2976 goto out;
2965 if (journal_quota) {
2966 err = ext3_journal_get_write_access(handle, bh);
2967 if (err) {
2968 brelse(bh);
2969 goto out;
2970 }
2971 }
2972 lock_buffer(bh);
2973 memcpy(bh->b_data+offset, data, tocopy);
2974 flush_dcache_page(bh->b_page);
2975 unlock_buffer(bh);
2976 if (journal_quota)
2977 err = ext3_journal_dirty_metadata(handle, bh);
2978 else {
2979 /* Always do at least ordered writes for quotas */
2980 err = ext3_journal_dirty_data(handle, bh);
2981 mark_buffer_dirty(bh);
2982 } 2977 }
2983 brelse(bh);
2984 if (err)
2985 goto out;
2986 offset = 0;
2987 towrite -= tocopy;
2988 data += tocopy;
2989 blk++;
2990 } 2978 }
2979 lock_buffer(bh);
2980 memcpy(bh->b_data+offset, data, len);
2981 flush_dcache_page(bh->b_page);
2982 unlock_buffer(bh);
2983 if (journal_quota)
2984 err = ext3_journal_dirty_metadata(handle, bh);
2985 else {
2986 /* Always do at least ordered writes for quotas */
2987 err = ext3_journal_dirty_data(handle, bh);
2988 mark_buffer_dirty(bh);
2989 }
2990 brelse(bh);
2991out: 2991out:
2992 if (len == towrite) { 2992 if (err) {
2993 mutex_unlock(&inode->i_mutex); 2993 mutex_unlock(&inode->i_mutex);
2994 return err; 2994 return err;
2995 } 2995 }
2996 if (inode->i_size < off+len-towrite) { 2996 if (inode->i_size < off + len) {
2997 i_size_write(inode, off+len-towrite); 2997 i_size_write(inode, off + len);
2998 EXT3_I(inode)->i_disksize = inode->i_size; 2998 EXT3_I(inode)->i_disksize = inode->i_size;
2999 } 2999 }
3000 inode->i_version++; 3000 inode->i_version++;
3001 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 3001 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
3002 ext3_mark_inode_dirty(handle, inode); 3002 ext3_mark_inode_dirty(handle, inode);
3003 mutex_unlock(&inode->i_mutex); 3003 mutex_unlock(&inode->i_mutex);
3004 return len - towrite; 3004 return len;
3005} 3005}
3006 3006
3007#endif 3007#endif
diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c
index ff7b4ccd8983..7c4898207776 100644
--- a/fs/ext3/symlink.c
+++ b/fs/ext3/symlink.c
@@ -34,6 +34,7 @@ const struct inode_operations ext3_symlink_inode_operations = {
34 .readlink = generic_readlink, 34 .readlink = generic_readlink,
35 .follow_link = page_follow_link_light, 35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link, 36 .put_link = page_put_link,
37 .setattr = ext3_setattr,
37#ifdef CONFIG_EXT3_FS_XATTR 38#ifdef CONFIG_EXT3_FS_XATTR
38 .setxattr = generic_setxattr, 39 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr, 40 .getxattr = generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext3_symlink_inode_operations = {
45const struct inode_operations ext3_fast_symlink_inode_operations = { 46const struct inode_operations ext3_fast_symlink_inode_operations = {
46 .readlink = generic_readlink, 47 .readlink = generic_readlink,
47 .follow_link = ext3_follow_link, 48 .follow_link = ext3_follow_link,
49 .setattr = ext3_setattr,
48#ifdef CONFIG_EXT3_FS_XATTR 50#ifdef CONFIG_EXT3_FS_XATTR
49 .setxattr = generic_setxattr, 51 .setxattr = generic_setxattr,
50 .getxattr = generic_getxattr, 52 .getxattr = generic_getxattr,
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 66895ccf76c7..534a94c3a933 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -274,7 +274,7 @@ ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
274 void *end; 274 void *end;
275 int error; 275 int error;
276 276
277 if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)) 277 if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
278 return -ENODATA; 278 return -ENODATA;
279 error = ext3_get_inode_loc(inode, &iloc); 279 error = ext3_get_inode_loc(inode, &iloc);
280 if (error) 280 if (error)
@@ -403,7 +403,7 @@ ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
403 void *end; 403 void *end;
404 int error; 404 int error;
405 405
406 if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)) 406 if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
407 return 0; 407 return 0;
408 error = ext3_get_inode_loc(inode, &iloc); 408 error = ext3_get_inode_loc(inode, &iloc);
409 if (error) 409 if (error)
@@ -500,7 +500,7 @@ ext3_xattr_release_block(handle_t *handle, struct inode *inode,
500 error = ext3_journal_dirty_metadata(handle, bh); 500 error = ext3_journal_dirty_metadata(handle, bh);
501 if (IS_SYNC(inode)) 501 if (IS_SYNC(inode))
502 handle->h_sync = 1; 502 handle->h_sync = 1;
503 vfs_dq_free_block(inode, 1); 503 dquot_free_block(inode, 1);
504 ea_bdebug(bh, "refcount now=%d; releasing", 504 ea_bdebug(bh, "refcount now=%d; releasing",
505 le32_to_cpu(BHDR(bh)->h_refcount)); 505 le32_to_cpu(BHDR(bh)->h_refcount));
506 if (ce) 506 if (ce)
@@ -775,8 +775,8 @@ inserted:
775 else { 775 else {
776 /* The old block is released after updating 776 /* The old block is released after updating
777 the inode. */ 777 the inode. */
778 error = -EDQUOT; 778 error = dquot_alloc_block(inode, 1);
779 if (vfs_dq_alloc_block(inode, 1)) 779 if (error)
780 goto cleanup; 780 goto cleanup;
781 error = ext3_journal_get_write_access(handle, 781 error = ext3_journal_get_write_access(handle,
782 new_bh); 782 new_bh);
@@ -850,7 +850,7 @@ cleanup:
850 return error; 850 return error;
851 851
852cleanup_dquot: 852cleanup_dquot:
853 vfs_dq_free_block(inode, 1); 853 dquot_free_block(inode, 1);
854 goto cleanup; 854 goto cleanup;
855 855
856bad_block: 856bad_block:
@@ -882,7 +882,7 @@ ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i,
882 is->s.base = is->s.first = IFIRST(header); 882 is->s.base = is->s.first = IFIRST(header);
883 is->s.here = is->s.first; 883 is->s.here = is->s.first;
884 is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; 884 is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
885 if (EXT3_I(inode)->i_state & EXT3_STATE_XATTR) { 885 if (ext3_test_inode_state(inode, EXT3_STATE_XATTR)) {
886 error = ext3_xattr_check_names(IFIRST(header), is->s.end); 886 error = ext3_xattr_check_names(IFIRST(header), is->s.end);
887 if (error) 887 if (error)
888 return error; 888 return error;
@@ -914,10 +914,10 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
914 header = IHDR(inode, ext3_raw_inode(&is->iloc)); 914 header = IHDR(inode, ext3_raw_inode(&is->iloc));
915 if (!IS_LAST_ENTRY(s->first)) { 915 if (!IS_LAST_ENTRY(s->first)) {
916 header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); 916 header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
917 EXT3_I(inode)->i_state |= EXT3_STATE_XATTR; 917 ext3_set_inode_state(inode, EXT3_STATE_XATTR);
918 } else { 918 } else {
919 header->h_magic = cpu_to_le32(0); 919 header->h_magic = cpu_to_le32(0);
920 EXT3_I(inode)->i_state &= ~EXT3_STATE_XATTR; 920 ext3_clear_inode_state(inode, EXT3_STATE_XATTR);
921 } 921 }
922 return 0; 922 return 0;
923} 923}
@@ -967,10 +967,10 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
967 if (error) 967 if (error)
968 goto cleanup; 968 goto cleanup;
969 969
970 if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) { 970 if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) {
971 struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc); 971 struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc);
972 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); 972 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
973 EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW; 973 ext3_clear_inode_state(inode, EXT3_STATE_NEW);
974 } 974 }
975 975
976 error = ext3_xattr_ibody_find(inode, &i, &is); 976 error = ext3_xattr_ibody_find(inode, &i, &is);
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 474348788dd9..3af91f476dff 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -4,6 +4,7 @@
4 */ 4 */
5 5
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/slab.h>
7#include <linux/string.h> 8#include <linux/string.h>
8#include <linux/fs.h> 9#include <linux/fs.h>
9#include <linux/ext3_jbd.h> 10#include <linux/ext3_jbd.h>
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 9acf7e808139..9ed1bb1f319f 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -28,6 +28,7 @@ config EXT4_FS
28 28
29config EXT4_USE_FOR_EXT23 29config EXT4_USE_FOR_EXT23
30 bool "Use ext4 for ext2/ext3 file systems" 30 bool "Use ext4 for ext2/ext3 file systems"
31 depends on EXT4_FS
31 depends on EXT3_FS=n || EXT2_FS=n 32 depends on EXT3_FS=n || EXT2_FS=n
32 default y 33 default y
33 help 34 help
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 22bc7435d913..d2f37a5516c7 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -97,8 +97,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
97 /* If checksum is bad mark all blocks used to prevent allocation 97 /* If checksum is bad mark all blocks used to prevent allocation
98 * essentially implementing a per-group read-only flag. */ 98 * essentially implementing a per-group read-only flag. */
99 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 99 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
100 ext4_error(sb, __func__, 100 ext4_error(sb, "Checksum bad for group %u",
101 "Checksum bad for group %u", block_group); 101 block_group);
102 ext4_free_blks_set(sb, gdp, 0); 102 ext4_free_blks_set(sb, gdp, 0);
103 ext4_free_inodes_set(sb, gdp, 0); 103 ext4_free_inodes_set(sb, gdp, 0);
104 ext4_itable_unused_set(sb, gdp, 0); 104 ext4_itable_unused_set(sb, gdp, 0);
@@ -130,8 +130,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
130 * to make sure we calculate the right free blocks 130 * to make sure we calculate the right free blocks
131 */ 131 */
132 group_blocks = ext4_blocks_count(sbi->s_es) - 132 group_blocks = ext4_blocks_count(sbi->s_es) -
133 le32_to_cpu(sbi->s_es->s_first_data_block) - 133 ext4_group_first_block_no(sb, ngroups - 1);
134 (EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1));
135 } else { 134 } else {
136 group_blocks = EXT4_BLOCKS_PER_GROUP(sb); 135 group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
137 } 136 }
@@ -189,9 +188,6 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
189 * when a file system is mounted (see ext4_fill_super). 188 * when a file system is mounted (see ext4_fill_super).
190 */ 189 */
191 190
192
193#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
194
195/** 191/**
196 * ext4_get_group_desc() -- load group descriptor from disk 192 * ext4_get_group_desc() -- load group descriptor from disk
197 * @sb: super block 193 * @sb: super block
@@ -210,10 +206,8 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
210 struct ext4_sb_info *sbi = EXT4_SB(sb); 206 struct ext4_sb_info *sbi = EXT4_SB(sb);
211 207
212 if (block_group >= ngroups) { 208 if (block_group >= ngroups) {
213 ext4_error(sb, "ext4_get_group_desc", 209 ext4_error(sb, "block_group >= groups_count - block_group = %u,"
214 "block_group >= groups_count - " 210 " groups_count = %u", block_group, ngroups);
215 "block_group = %u, groups_count = %u",
216 block_group, ngroups);
217 211
218 return NULL; 212 return NULL;
219 } 213 }
@@ -221,8 +215,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
221 group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); 215 group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
222 offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); 216 offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
223 if (!sbi->s_group_desc[group_desc]) { 217 if (!sbi->s_group_desc[group_desc]) {
224 ext4_error(sb, "ext4_get_group_desc", 218 ext4_error(sb, "Group descriptor not loaded - "
225 "Group descriptor not loaded - "
226 "block_group = %u, group_desc = %u, desc = %u", 219 "block_group = %u, group_desc = %u, desc = %u",
227 block_group, group_desc, offset); 220 block_group, group_desc, offset);
228 return NULL; 221 return NULL;
@@ -282,9 +275,7 @@ static int ext4_valid_block_bitmap(struct super_block *sb,
282 return 1; 275 return 1;
283 276
284err_out: 277err_out:
285 ext4_error(sb, __func__, 278 ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu",
286 "Invalid block bitmap - "
287 "block_group = %d, block = %llu",
288 block_group, bitmap_blk); 279 block_group, bitmap_blk);
289 return 0; 280 return 0;
290} 281}
@@ -311,8 +302,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
311 bitmap_blk = ext4_block_bitmap(sb, desc); 302 bitmap_blk = ext4_block_bitmap(sb, desc);
312 bh = sb_getblk(sb, bitmap_blk); 303 bh = sb_getblk(sb, bitmap_blk);
313 if (unlikely(!bh)) { 304 if (unlikely(!bh)) {
314 ext4_error(sb, __func__, 305 ext4_error(sb, "Cannot read block bitmap - "
315 "Cannot read block bitmap - "
316 "block_group = %u, block_bitmap = %llu", 306 "block_group = %u, block_bitmap = %llu",
317 block_group, bitmap_blk); 307 block_group, bitmap_blk);
318 return NULL; 308 return NULL;
@@ -354,8 +344,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
354 set_bitmap_uptodate(bh); 344 set_bitmap_uptodate(bh);
355 if (bh_submit_read(bh) < 0) { 345 if (bh_submit_read(bh) < 0) {
356 put_bh(bh); 346 put_bh(bh);
357 ext4_error(sb, __func__, 347 ext4_error(sb, "Cannot read block bitmap - "
358 "Cannot read block bitmap - "
359 "block_group = %u, block_bitmap = %llu", 348 "block_group = %u, block_bitmap = %llu",
360 block_group, bitmap_blk); 349 block_group, bitmap_blk);
361 return NULL; 350 return NULL;
@@ -419,8 +408,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
419 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || 408 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
420 in_range(block + count - 1, ext4_inode_table(sb, desc), 409 in_range(block + count - 1, ext4_inode_table(sb, desc),
421 sbi->s_itb_per_group)) { 410 sbi->s_itb_per_group)) {
422 ext4_error(sb, __func__, 411 ext4_error(sb, "Adding blocks in system zones - "
423 "Adding blocks in system zones - "
424 "Block = %llu, count = %lu", 412 "Block = %llu, count = %lu",
425 block, count); 413 block, count);
426 goto error_return; 414 goto error_return;
@@ -453,8 +441,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
453 BUFFER_TRACE(bitmap_bh, "clear bit"); 441 BUFFER_TRACE(bitmap_bh, "clear bit");
454 if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), 442 if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
455 bit + i, bitmap_bh->b_data)) { 443 bit + i, bitmap_bh->b_data)) {
456 ext4_error(sb, __func__, 444 ext4_error(sb, "bit already cleared for block %llu",
457 "bit already cleared for block %llu",
458 (ext4_fsblk_t)(block + i)); 445 (ext4_fsblk_t)(block + i));
459 BUFFER_TRACE(bitmap_bh, "bit already cleared"); 446 BUFFER_TRACE(bitmap_bh, "bit already cleared");
460 } else { 447 } else {
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 4df8621ec31c..538c48655084 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -16,9 +16,9 @@
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/swap.h> 17#include <linux/swap.h>
18#include <linux/pagemap.h> 18#include <linux/pagemap.h>
19#include <linux/version.h>
20#include <linux/blkdev.h> 19#include <linux/blkdev.h>
21#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/slab.h>
22#include "ext4.h" 22#include "ext4.h"
23 23
24struct ext4_system_zone { 24struct ext4_system_zone {
@@ -206,14 +206,14 @@ void ext4_release_system_zone(struct super_block *sb)
206 entry = rb_entry(n, struct ext4_system_zone, node); 206 entry = rb_entry(n, struct ext4_system_zone, node);
207 kmem_cache_free(ext4_system_zone_cachep, entry); 207 kmem_cache_free(ext4_system_zone_cachep, entry);
208 if (!parent) 208 if (!parent)
209 EXT4_SB(sb)->system_blks.rb_node = NULL; 209 EXT4_SB(sb)->system_blks = RB_ROOT;
210 else if (parent->rb_left == n) 210 else if (parent->rb_left == n)
211 parent->rb_left = NULL; 211 parent->rb_left = NULL;
212 else if (parent->rb_right == n) 212 else if (parent->rb_right == n)
213 parent->rb_right = NULL; 213 parent->rb_right = NULL;
214 n = parent; 214 n = parent;
215 } 215 }
216 EXT4_SB(sb)->system_blks.rb_node = NULL; 216 EXT4_SB(sb)->system_blks = RB_ROOT;
217} 217}
218 218
219/* 219/*
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 9dc93168e262..86cb6d86a048 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -83,10 +83,12 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
83 error_msg = "inode out of bounds"; 83 error_msg = "inode out of bounds";
84 84
85 if (error_msg != NULL) 85 if (error_msg != NULL)
86 ext4_error(dir->i_sb, function, 86 __ext4_error(dir->i_sb, function,
87 "bad entry in directory #%lu: %s - " 87 "bad entry in directory #%lu: %s - block=%llu"
88 "offset=%u, inode=%u, rec_len=%d, name_len=%d", 88 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
89 dir->i_ino, error_msg, offset, 89 dir->i_ino, error_msg,
90 (unsigned long long) bh->b_blocknr,
91 (unsigned) (offset%bh->b_size), offset,
90 le32_to_cpu(de->inode), 92 le32_to_cpu(de->inode),
91 rlen, de->name_len); 93 rlen, de->name_len);
92 return error_msg == NULL ? 1 : 0; 94 return error_msg == NULL ? 1 : 0;
@@ -150,7 +152,7 @@ static int ext4_readdir(struct file *filp,
150 */ 152 */
151 if (!bh) { 153 if (!bh) {
152 if (!dir_has_error) { 154 if (!dir_has_error) {
153 ext4_error(sb, __func__, "directory #%lu " 155 ext4_error(sb, "directory #%lu "
154 "contains a hole at offset %Lu", 156 "contains a hole at offset %Lu",
155 inode->i_ino, 157 inode->i_ino,
156 (unsigned long long) filp->f_pos); 158 (unsigned long long) filp->f_pos);
@@ -303,7 +305,7 @@ static void free_rb_tree_fname(struct rb_root *root)
303 kfree(old); 305 kfree(old);
304 } 306 }
305 if (!parent) 307 if (!parent)
306 root->rb_node = NULL; 308 *root = RB_ROOT;
307 else if (parent->rb_left == n) 309 else if (parent->rb_left == n)
308 parent->rb_left = NULL; 310 parent->rb_left = NULL;
309 else if (parent->rb_right == n) 311 else if (parent->rb_right == n)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 56f9271ee8cc..bf938cf7c5f0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -53,6 +53,12 @@
53#define ext4_debug(f, a...) do {} while (0) 53#define ext4_debug(f, a...) do {} while (0)
54#endif 54#endif
55 55
56#define EXT4_ERROR_INODE(inode, fmt, a...) \
57 ext4_error_inode(__func__, (inode), (fmt), ## a);
58
59#define EXT4_ERROR_FILE(file, fmt, a...) \
60 ext4_error_file(__func__, (file), (fmt), ## a);
61
56/* data type for block offset of block group */ 62/* data type for block offset of block group */
57typedef int ext4_grpblk_t; 63typedef int ext4_grpblk_t;
58 64
@@ -133,14 +139,14 @@ struct mpage_da_data {
133 int pages_written; 139 int pages_written;
134 int retval; 140 int retval;
135}; 141};
136#define DIO_AIO_UNWRITTEN 0x1 142#define EXT4_IO_UNWRITTEN 0x1
137typedef struct ext4_io_end { 143typedef struct ext4_io_end {
138 struct list_head list; /* per-file finished AIO list */ 144 struct list_head list; /* per-file finished AIO list */
139 struct inode *inode; /* file being written to */ 145 struct inode *inode; /* file being written to */
140 unsigned int flag; /* unwritten or not */ 146 unsigned int flag; /* unwritten or not */
141 int error; /* I/O error code */ 147 struct page *page; /* page struct for buffer write */
142 ext4_lblk_t offset; /* offset in the file */ 148 loff_t offset; /* offset in the file */
143 size_t size; /* size of the extent */ 149 ssize_t size; /* size of the extent */
144 struct work_struct work; /* data work queue */ 150 struct work_struct work; /* data work queue */
145} ext4_io_end_t; 151} ext4_io_end_t;
146 152
@@ -284,10 +290,12 @@ struct flex_groups {
284#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ 290#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
285#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ 291#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
286#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ 292#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
293#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
294#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
287#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ 295#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
288 296
289#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ 297#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */
290#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */ 298#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */
291 299
292/* Flags that should be inherited by new inodes from their parent. */ 300/* Flags that should be inherited by new inodes from their parent. */
293#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ 301#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
@@ -313,17 +321,6 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
313 return flags & EXT4_OTHER_FLMASK; 321 return flags & EXT4_OTHER_FLMASK;
314} 322}
315 323
316/*
317 * Inode dynamic state flags
318 */
319#define EXT4_STATE_JDATA 0x00000001 /* journaled data exists */
320#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */
321#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
322#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
323#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
324#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
325#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/
326
327/* Used to pass group descriptor data when online resize is done */ 324/* Used to pass group descriptor data when online resize is done */
328struct ext4_new_group_input { 325struct ext4_new_group_input {
329 __u32 group; /* Group number for this data */ 326 __u32 group; /* Group number for this data */
@@ -361,25 +358,23 @@ struct ext4_new_group_data {
361 so set the magic i_delalloc_reserve_flag after taking the 358 so set the magic i_delalloc_reserve_flag after taking the
362 inode allocation semaphore for */ 359 inode allocation semaphore for */
363#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 360#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
364 /* Call ext4_da_update_reserve_space() after successfully
365 allocating the blocks */
366#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
367 /* caller is from the direct IO path, request to creation of an 361 /* caller is from the direct IO path, request to creation of an
368 unitialized extents if not allocated, split the uninitialized 362 unitialized extents if not allocated, split the uninitialized
369 extent if blocks has been preallocated already*/ 363 extent if blocks has been preallocated already*/
370#define EXT4_GET_BLOCKS_DIO 0x0010 364#define EXT4_GET_BLOCKS_PRE_IO 0x0008
371#define EXT4_GET_BLOCKS_CONVERT 0x0020 365#define EXT4_GET_BLOCKS_CONVERT 0x0010
372#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\ 366#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\
367 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
368 /* Convert extent to initialized after IO complete */
369#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
373 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) 370 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
374 /* Convert extent to initialized after direct IO complete */
375#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
376 EXT4_GET_BLOCKS_DIO_CREATE_EXT)
377 371
378/* 372/*
379 * Flags used by ext4_free_blocks 373 * Flags used by ext4_free_blocks
380 */ 374 */
381#define EXT4_FREE_BLOCKS_METADATA 0x0001 375#define EXT4_FREE_BLOCKS_METADATA 0x0001
382#define EXT4_FREE_BLOCKS_FORGET 0x0002 376#define EXT4_FREE_BLOCKS_FORGET 0x0002
377#define EXT4_FREE_BLOCKS_VALIDATED 0x0004
383 378
384/* 379/*
385 * ioctl commands 380 * ioctl commands
@@ -633,7 +628,7 @@ struct ext4_inode_info {
633 * near to their parent directory's inode. 628 * near to their parent directory's inode.
634 */ 629 */
635 ext4_group_t i_block_group; 630 ext4_group_t i_block_group;
636 __u32 i_state; /* Dynamic state flags for ext4 */ 631 unsigned long i_state_flags; /* Dynamic state flags */
637 632
638 ext4_lblk_t i_dir_start_lookup; 633 ext4_lblk_t i_dir_start_lookup;
639#ifdef CONFIG_EXT4_FS_XATTR 634#ifdef CONFIG_EXT4_FS_XATTR
@@ -699,6 +694,8 @@ struct ext4_inode_info {
699 unsigned int i_reserved_meta_blocks; 694 unsigned int i_reserved_meta_blocks;
700 unsigned int i_allocated_meta_blocks; 695 unsigned int i_allocated_meta_blocks;
701 unsigned short i_delalloc_reserved_flag; 696 unsigned short i_delalloc_reserved_flag;
697 sector_t i_da_metadata_calc_last_lblock;
698 int i_da_metadata_calc_len;
702 699
703 /* on-disk additional length */ 700 /* on-disk additional length */
704 __u16 i_extra_isize; 701 __u16 i_extra_isize;
@@ -709,8 +706,9 @@ struct ext4_inode_info {
709 qsize_t i_reserved_quota; 706 qsize_t i_reserved_quota;
710#endif 707#endif
711 708
712 /* completed async DIOs that might need unwritten extents handling */ 709 /* completed IOs that might need unwritten extents handling */
713 struct list_head i_aio_dio_complete_list; 710 struct list_head i_completed_io_list;
711 spinlock_t i_completed_io_lock;
714 /* current io_end structure for async DIO write*/ 712 /* current io_end structure for async DIO write*/
715 ext4_io_end_t *cur_aio_dio; 713 ext4_io_end_t *cur_aio_dio;
716 714
@@ -761,6 +759,7 @@ struct ext4_inode_info {
761#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ 759#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
762#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ 760#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
763#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ 761#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
762#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */
764#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ 763#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
765#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 764#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
766#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 765#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
@@ -1015,7 +1014,7 @@ struct ext4_sb_info {
1015 atomic_t s_lock_busy; 1014 atomic_t s_lock_busy;
1016 1015
1017 /* locality groups */ 1016 /* locality groups */
1018 struct ext4_locality_group *s_locality_groups; 1017 struct ext4_locality_group __percpu *s_locality_groups;
1019 1018
1020 /* for write statistics */ 1019 /* for write statistics */
1021 unsigned long s_sectors_written_start; 1020 unsigned long s_sectors_written_start;
@@ -1051,6 +1050,34 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
1051 (ino >= EXT4_FIRST_INO(sb) && 1050 (ino >= EXT4_FIRST_INO(sb) &&
1052 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); 1051 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
1053} 1052}
1053
1054/*
1055 * Inode dynamic state flags
1056 */
1057enum {
1058 EXT4_STATE_JDATA, /* journaled data exists */
1059 EXT4_STATE_NEW, /* inode is newly created */
1060 EXT4_STATE_XATTR, /* has in-inode xattrs */
1061 EXT4_STATE_NO_EXPAND, /* No space for expansion */
1062 EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
1063 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
1064 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
1065};
1066
1067static inline int ext4_test_inode_state(struct inode *inode, int bit)
1068{
1069 return test_bit(bit, &EXT4_I(inode)->i_state_flags);
1070}
1071
1072static inline void ext4_set_inode_state(struct inode *inode, int bit)
1073{
1074 set_bit(bit, &EXT4_I(inode)->i_state_flags);
1075}
1076
1077static inline void ext4_clear_inode_state(struct inode *inode, int bit)
1078{
1079 clear_bit(bit, &EXT4_I(inode)->i_state_flags);
1080}
1054#else 1081#else
1055/* Assume that user mode programs are passing in an ext4fs superblock, not 1082/* Assume that user mode programs are passing in an ext4fs superblock, not
1056 * a kernel struct super_block. This will allow us to call the feature-test 1083 * a kernel struct super_block. This will allow us to call the feature-test
@@ -1127,6 +1154,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
1127#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 1154#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080
1128#define EXT4_FEATURE_INCOMPAT_MMP 0x0100 1155#define EXT4_FEATURE_INCOMPAT_MMP 0x0100
1129#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 1156#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200
1157#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */
1158#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */
1130 1159
1131#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR 1160#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
1132#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ 1161#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1417,7 +1446,7 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
1417 struct buffer_head *bh_result, int create); 1446 struct buffer_head *bh_result, int create);
1418 1447
1419extern struct inode *ext4_iget(struct super_block *, unsigned long); 1448extern struct inode *ext4_iget(struct super_block *, unsigned long);
1420extern int ext4_write_inode(struct inode *, int); 1449extern int ext4_write_inode(struct inode *, struct writeback_control *);
1421extern int ext4_setattr(struct dentry *, struct iattr *); 1450extern int ext4_setattr(struct dentry *, struct iattr *);
1422extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, 1451extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
1423 struct kstat *stat); 1452 struct kstat *stat);
@@ -1440,7 +1469,9 @@ extern int ext4_block_truncate_page(handle_t *handle,
1440 struct address_space *mapping, loff_t from); 1469 struct address_space *mapping, loff_t from);
1441extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 1470extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
1442extern qsize_t *ext4_get_reserved_space(struct inode *inode); 1471extern qsize_t *ext4_get_reserved_space(struct inode *inode);
1443extern int flush_aio_dio_completed_IO(struct inode *inode); 1472extern int flush_completed_IO(struct inode *inode);
1473extern void ext4_da_update_reserve_space(struct inode *inode,
1474 int used, int quota_claim);
1444/* ioctl.c */ 1475/* ioctl.c */
1445extern long ext4_ioctl(struct file *, unsigned int, unsigned long); 1476extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
1446extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); 1477extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -1464,13 +1495,20 @@ extern int ext4_group_extend(struct super_block *sb,
1464 ext4_fsblk_t n_blocks_count); 1495 ext4_fsblk_t n_blocks_count);
1465 1496
1466/* super.c */ 1497/* super.c */
1467extern void ext4_error(struct super_block *, const char *, const char *, ...) 1498extern void __ext4_error(struct super_block *, const char *, const char *, ...)
1499 __attribute__ ((format (printf, 3, 4)));
1500#define ext4_error(sb, message...) __ext4_error(sb, __func__, ## message)
1501extern void ext4_error_inode(const char *, struct inode *, const char *, ...)
1502 __attribute__ ((format (printf, 3, 4)));
1503extern void ext4_error_file(const char *, struct file *, const char *, ...)
1468 __attribute__ ((format (printf, 3, 4))); 1504 __attribute__ ((format (printf, 3, 4)));
1469extern void __ext4_std_error(struct super_block *, const char *, int); 1505extern void __ext4_std_error(struct super_block *, const char *, int);
1470extern void ext4_abort(struct super_block *, const char *, const char *, ...) 1506extern void ext4_abort(struct super_block *, const char *, const char *, ...)
1471 __attribute__ ((format (printf, 3, 4))); 1507 __attribute__ ((format (printf, 3, 4)));
1472extern void ext4_warning(struct super_block *, const char *, const char *, ...) 1508extern void __ext4_warning(struct super_block *, const char *,
1509 const char *, ...)
1473 __attribute__ ((format (printf, 3, 4))); 1510 __attribute__ ((format (printf, 3, 4)));
1511#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, ## message)
1474extern void ext4_msg(struct super_block *, const char *, const char *, ...) 1512extern void ext4_msg(struct super_block *, const char *, const char *, ...)
1475 __attribute__ ((format (printf, 3, 4))); 1513 __attribute__ ((format (printf, 3, 4)));
1476extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, 1514extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
@@ -1743,7 +1781,7 @@ extern void ext4_ext_release(struct super_block *);
1743extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 1781extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1744 loff_t len); 1782 loff_t len);
1745extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 1783extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1746 loff_t len); 1784 ssize_t len);
1747extern int ext4_get_blocks(handle_t *handle, struct inode *inode, 1785extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1748 sector_t block, unsigned int max_blocks, 1786 sector_t block, unsigned int max_blocks,
1749 struct buffer_head *bh, int flags); 1787 struct buffer_head *bh, int flags);
@@ -1755,6 +1793,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
1755 __u64 len, __u64 *moved_len); 1793 __u64 len, __u64 *moved_len);
1756 1794
1757 1795
1796/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
1797enum ext4_state_bits {
1798 BH_Uninit /* blocks are allocated but uninitialized on disk */
1799 = BH_JBDPrivateStart,
1800};
1801
1802BUFFER_FNS(Uninit, uninit)
1803TAS_BUFFER_FNS(Uninit, uninit)
1804
1758/* 1805/*
1759 * Add new method to test wether block and inode bitmaps are properly 1806 * Add new method to test wether block and inode bitmaps are properly
1760 * initialized. With uninit_bg reading the block from disk is not enough 1807 * initialized. With uninit_bg reading the block from disk is not enough
@@ -1772,6 +1819,8 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
1772 set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); 1819 set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
1773} 1820}
1774 1821
1822#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
1823
1775#endif /* __KERNEL__ */ 1824#endif /* __KERNEL__ */
1776 1825
1777#endif /* _EXT4_H */ 1826#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 2ca686454e87..bdb6ce7e2eb4 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -225,7 +225,8 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
225 ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); 225 ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
226} 226}
227 227
228extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); 228extern int ext4_ext_calc_metadata_amount(struct inode *inode,
229 sector_t lblocks);
229extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); 230extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
230extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); 231extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
231extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); 232extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index b57e5c711b6d..53d2764d71ca 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -125,14 +125,14 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
125 ext4_journal_abort_handle(where, __func__, bh, 125 ext4_journal_abort_handle(where, __func__, bh,
126 handle, err); 126 handle, err);
127 } else { 127 } else {
128 if (inode && bh) 128 if (inode)
129 mark_buffer_dirty_inode(bh, inode); 129 mark_buffer_dirty_inode(bh, inode);
130 else 130 else
131 mark_buffer_dirty(bh); 131 mark_buffer_dirty(bh);
132 if (inode && inode_needs_sync(inode)) { 132 if (inode && inode_needs_sync(inode)) {
133 sync_dirty_buffer(bh); 133 sync_dirty_buffer(bh);
134 if (buffer_req(bh) && !buffer_uptodate(bh)) { 134 if (buffer_req(bh) && !buffer_uptodate(bh)) {
135 ext4_error(inode->i_sb, __func__, 135 ext4_error(inode->i_sb,
136 "IO error syncing inode, " 136 "IO error syncing inode, "
137 "inode=%lu, block=%llu", 137 "inode=%lu, block=%llu",
138 inode->i_ino, 138 inode->i_ino,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 05eca817d704..b79ad5126468 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -304,4 +304,28 @@ static inline int ext4_should_writeback_data(struct inode *inode)
304 return 0; 304 return 0;
305} 305}
306 306
307/*
308 * This function controls whether or not we should try to go down the
309 * dioread_nolock code paths, which makes it safe to avoid taking
310 * i_mutex for direct I/O reads. This only works for extent-based
311 * files, and it doesn't work for nobh or if data journaling is
312 * enabled, since the dioread_nolock code uses b_private to pass
313 * information back to the I/O completion handler, and this conflicts
314 * with the jbd's use of b_private.
315 */
316static inline int ext4_should_dioread_nolock(struct inode *inode)
317{
318 if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
319 return 0;
320 if (test_opt(inode->i_sb, NOBH))
321 return 0;
322 if (!S_ISREG(inode->i_mode))
323 return 0;
324 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
325 return 0;
326 if (ext4_should_journal_data(inode))
327 return 0;
328 return 1;
329}
330
307#endif /* _EXT4_JBD2_H */ 331#endif /* _EXT4_JBD2_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 3a7928f825e4..236b834b4ca8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -195,8 +195,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
195 if (S_ISREG(inode->i_mode)) 195 if (S_ISREG(inode->i_mode))
196 block_group++; 196 block_group++;
197 } 197 }
198 bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + 198 bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
199 le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
200 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; 199 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
201 200
202 /* 201 /*
@@ -296,29 +295,44 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
296 * to allocate @blocks 295 * to allocate @blocks
297 * Worse case is one block per extent 296 * Worse case is one block per extent
298 */ 297 */
299int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks) 298int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
300{ 299{
301 int lcap, icap, rcap, leafs, idxs, num; 300 struct ext4_inode_info *ei = EXT4_I(inode);
302 int newextents = blocks; 301 int idxs, num = 0;
303
304 rcap = ext4_ext_space_root_idx(inode, 0);
305 lcap = ext4_ext_space_block(inode, 0);
306 icap = ext4_ext_space_block_idx(inode, 0);
307 302
308 /* number of new leaf blocks needed */ 303 idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
309 num = leafs = (newextents + lcap - 1) / lcap; 304 / sizeof(struct ext4_extent_idx));
310 305
311 /* 306 /*
312 * Worse case, we need separate index block(s) 307 * If the new delayed allocation block is contiguous with the
313 * to link all new leaf blocks 308 * previous da block, it can share index blocks with the
309 * previous block, so we only need to allocate a new index
310 * block every idxs leaf blocks. At ldxs**2 blocks, we need
311 * an additional index block, and at ldxs**3 blocks, yet
312 * another index blocks.
314 */ 313 */
315 idxs = (leafs + icap - 1) / icap; 314 if (ei->i_da_metadata_calc_len &&
316 do { 315 ei->i_da_metadata_calc_last_lblock+1 == lblock) {
317 num += idxs; 316 if ((ei->i_da_metadata_calc_len % idxs) == 0)
318 idxs = (idxs + icap - 1) / icap; 317 num++;
319 } while (idxs > rcap); 318 if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
319 num++;
320 if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
321 num++;
322 ei->i_da_metadata_calc_len = 0;
323 } else
324 ei->i_da_metadata_calc_len++;
325 ei->i_da_metadata_calc_last_lblock++;
326 return num;
327 }
320 328
321 return num; 329 /*
330 * In the worst case we need a new set of index blocks at
331 * every level of the inode's extent tree.
332 */
333 ei->i_da_metadata_calc_len = 1;
334 ei->i_da_metadata_calc_last_lblock = lblock;
335 return ext_depth(inode) + 1;
322} 336}
323 337
324static int 338static int
@@ -425,7 +439,7 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
425 return 0; 439 return 0;
426 440
427corrupted: 441corrupted:
428 ext4_error(inode->i_sb, function, 442 __ext4_error(inode->i_sb, function,
429 "bad header/extent in inode #%lu: %s - magic %x, " 443 "bad header/extent in inode #%lu: %s - magic %x, "
430 "entries %u, max %u(%u), depth %u(%u)", 444 "entries %u, max %u(%u), depth %u(%u)",
431 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), 445 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
@@ -688,7 +702,12 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
688 } 702 }
689 eh = ext_block_hdr(bh); 703 eh = ext_block_hdr(bh);
690 ppos++; 704 ppos++;
691 BUG_ON(ppos > depth); 705 if (unlikely(ppos > depth)) {
706 put_bh(bh);
707 EXT4_ERROR_INODE(inode,
708 "ppos %d > depth %d", ppos, depth);
709 goto err;
710 }
692 path[ppos].p_bh = bh; 711 path[ppos].p_bh = bh;
693 path[ppos].p_hdr = eh; 712 path[ppos].p_hdr = eh;
694 i--; 713 i--;
@@ -734,7 +753,12 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
734 if (err) 753 if (err)
735 return err; 754 return err;
736 755
737 BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block)); 756 if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
757 EXT4_ERROR_INODE(inode,
758 "logical %d == ei_block %d!",
759 logical, le32_to_cpu(curp->p_idx->ei_block));
760 return -EIO;
761 }
738 len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; 762 len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
739 if (logical > le32_to_cpu(curp->p_idx->ei_block)) { 763 if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
740 /* insert after */ 764 /* insert after */
@@ -764,9 +788,17 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
764 ext4_idx_store_pblock(ix, ptr); 788 ext4_idx_store_pblock(ix, ptr);
765 le16_add_cpu(&curp->p_hdr->eh_entries, 1); 789 le16_add_cpu(&curp->p_hdr->eh_entries, 1);
766 790
767 BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries) 791 if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
768 > le16_to_cpu(curp->p_hdr->eh_max)); 792 > le16_to_cpu(curp->p_hdr->eh_max))) {
769 BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr)); 793 EXT4_ERROR_INODE(inode,
794 "logical %d == ei_block %d!",
795 logical, le32_to_cpu(curp->p_idx->ei_block));
796 return -EIO;
797 }
798 if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
799 EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
800 return -EIO;
801 }
770 802
771 err = ext4_ext_dirty(handle, inode, curp); 803 err = ext4_ext_dirty(handle, inode, curp);
772 ext4_std_error(inode->i_sb, err); 804 ext4_std_error(inode->i_sb, err);
@@ -804,7 +836,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
804 836
805 /* if current leaf will be split, then we should use 837 /* if current leaf will be split, then we should use
806 * border from split point */ 838 * border from split point */
807 BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr)); 839 if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
840 EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
841 return -EIO;
842 }
808 if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { 843 if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
809 border = path[depth].p_ext[1].ee_block; 844 border = path[depth].p_ext[1].ee_block;
810 ext_debug("leaf will be split." 845 ext_debug("leaf will be split."
@@ -845,7 +880,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
845 880
846 /* initialize new leaf */ 881 /* initialize new leaf */
847 newblock = ablocks[--a]; 882 newblock = ablocks[--a];
848 BUG_ON(newblock == 0); 883 if (unlikely(newblock == 0)) {
884 EXT4_ERROR_INODE(inode, "newblock == 0!");
885 err = -EIO;
886 goto cleanup;
887 }
849 bh = sb_getblk(inode->i_sb, newblock); 888 bh = sb_getblk(inode->i_sb, newblock);
850 if (!bh) { 889 if (!bh) {
851 err = -EIO; 890 err = -EIO;
@@ -865,7 +904,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
865 ex = EXT_FIRST_EXTENT(neh); 904 ex = EXT_FIRST_EXTENT(neh);
866 905
867 /* move remainder of path[depth] to the new leaf */ 906 /* move remainder of path[depth] to the new leaf */
868 BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max); 907 if (unlikely(path[depth].p_hdr->eh_entries !=
908 path[depth].p_hdr->eh_max)) {
909 EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
910 path[depth].p_hdr->eh_entries,
911 path[depth].p_hdr->eh_max);
912 err = -EIO;
913 goto cleanup;
914 }
869 /* start copy from next extent */ 915 /* start copy from next extent */
870 /* TODO: we could do it by single memmove */ 916 /* TODO: we could do it by single memmove */
871 m = 0; 917 m = 0;
@@ -912,7 +958,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
912 958
913 /* create intermediate indexes */ 959 /* create intermediate indexes */
914 k = depth - at - 1; 960 k = depth - at - 1;
915 BUG_ON(k < 0); 961 if (unlikely(k < 0)) {
962 EXT4_ERROR_INODE(inode, "k %d < 0!", k);
963 err = -EIO;
964 goto cleanup;
965 }
916 if (k) 966 if (k)
917 ext_debug("create %d intermediate indices\n", k); 967 ext_debug("create %d intermediate indices\n", k);
918 /* insert new index into current index block */ 968 /* insert new index into current index block */
@@ -949,8 +999,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
949 999
950 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, 1000 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
951 EXT_MAX_INDEX(path[i].p_hdr)); 1001 EXT_MAX_INDEX(path[i].p_hdr));
952 BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) != 1002 if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
953 EXT_LAST_INDEX(path[i].p_hdr)); 1003 EXT_LAST_INDEX(path[i].p_hdr))) {
1004 EXT4_ERROR_INODE(inode,
1005 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
1006 le32_to_cpu(path[i].p_ext->ee_block));
1007 err = -EIO;
1008 goto cleanup;
1009 }
954 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { 1010 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
955 ext_debug("%d: move %d:%llu in new index %llu\n", i, 1011 ext_debug("%d: move %d:%llu in new index %llu\n", i,
956 le32_to_cpu(path[i].p_idx->ei_block), 1012 le32_to_cpu(path[i].p_idx->ei_block),
@@ -1188,7 +1244,10 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1188 struct ext4_extent *ex; 1244 struct ext4_extent *ex;
1189 int depth, ee_len; 1245 int depth, ee_len;
1190 1246
1191 BUG_ON(path == NULL); 1247 if (unlikely(path == NULL)) {
1248 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1249 return -EIO;
1250 }
1192 depth = path->p_depth; 1251 depth = path->p_depth;
1193 *phys = 0; 1252 *phys = 0;
1194 1253
@@ -1202,15 +1261,33 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
1202 ex = path[depth].p_ext; 1261 ex = path[depth].p_ext;
1203 ee_len = ext4_ext_get_actual_len(ex); 1262 ee_len = ext4_ext_get_actual_len(ex);
1204 if (*logical < le32_to_cpu(ex->ee_block)) { 1263 if (*logical < le32_to_cpu(ex->ee_block)) {
1205 BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); 1264 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1265 EXT4_ERROR_INODE(inode,
1266 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1267 *logical, le32_to_cpu(ex->ee_block));
1268 return -EIO;
1269 }
1206 while (--depth >= 0) { 1270 while (--depth >= 0) {
1207 ix = path[depth].p_idx; 1271 ix = path[depth].p_idx;
1208 BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); 1272 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1273 EXT4_ERROR_INODE(inode,
1274 "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1275 ix != NULL ? ix->ei_block : 0,
1276 EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1277 EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0,
1278 depth);
1279 return -EIO;
1280 }
1209 } 1281 }
1210 return 0; 1282 return 0;
1211 } 1283 }
1212 1284
1213 BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len)); 1285 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1286 EXT4_ERROR_INODE(inode,
1287 "logical %d < ee_block %d + ee_len %d!",
1288 *logical, le32_to_cpu(ex->ee_block), ee_len);
1289 return -EIO;
1290 }
1214 1291
1215 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; 1292 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1216 *phys = ext_pblock(ex) + ee_len - 1; 1293 *phys = ext_pblock(ex) + ee_len - 1;
@@ -1236,7 +1313,10 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1236 int depth; /* Note, NOT eh_depth; depth from top of tree */ 1313 int depth; /* Note, NOT eh_depth; depth from top of tree */
1237 int ee_len; 1314 int ee_len;
1238 1315
1239 BUG_ON(path == NULL); 1316 if (unlikely(path == NULL)) {
1317 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1318 return -EIO;
1319 }
1240 depth = path->p_depth; 1320 depth = path->p_depth;
1241 *phys = 0; 1321 *phys = 0;
1242 1322
@@ -1250,17 +1330,32 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1250 ex = path[depth].p_ext; 1330 ex = path[depth].p_ext;
1251 ee_len = ext4_ext_get_actual_len(ex); 1331 ee_len = ext4_ext_get_actual_len(ex);
1252 if (*logical < le32_to_cpu(ex->ee_block)) { 1332 if (*logical < le32_to_cpu(ex->ee_block)) {
1253 BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); 1333 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1334 EXT4_ERROR_INODE(inode,
1335 "first_extent(path[%d].p_hdr) != ex",
1336 depth);
1337 return -EIO;
1338 }
1254 while (--depth >= 0) { 1339 while (--depth >= 0) {
1255 ix = path[depth].p_idx; 1340 ix = path[depth].p_idx;
1256 BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); 1341 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1342 EXT4_ERROR_INODE(inode,
1343 "ix != EXT_FIRST_INDEX *logical %d!",
1344 *logical);
1345 return -EIO;
1346 }
1257 } 1347 }
1258 *logical = le32_to_cpu(ex->ee_block); 1348 *logical = le32_to_cpu(ex->ee_block);
1259 *phys = ext_pblock(ex); 1349 *phys = ext_pblock(ex);
1260 return 0; 1350 return 0;
1261 } 1351 }
1262 1352
1263 BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len)); 1353 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1354 EXT4_ERROR_INODE(inode,
1355 "logical %d < ee_block %d + ee_len %d!",
1356 *logical, le32_to_cpu(ex->ee_block), ee_len);
1357 return -EIO;
1358 }
1264 1359
1265 if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { 1360 if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1266 /* next allocated block in this leaf */ 1361 /* next allocated block in this leaf */
@@ -1399,8 +1494,12 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1399 1494
1400 eh = path[depth].p_hdr; 1495 eh = path[depth].p_hdr;
1401 ex = path[depth].p_ext; 1496 ex = path[depth].p_ext;
1402 BUG_ON(ex == NULL); 1497
1403 BUG_ON(eh == NULL); 1498 if (unlikely(ex == NULL || eh == NULL)) {
1499 EXT4_ERROR_INODE(inode,
1500 "ex %p == NULL or eh %p == NULL", ex, eh);
1501 return -EIO;
1502 }
1404 1503
1405 if (depth == 0) { 1504 if (depth == 0) {
1406 /* there is no tree at all */ 1505 /* there is no tree at all */
@@ -1523,8 +1622,9 @@ int ext4_ext_try_to_merge(struct inode *inode,
1523 merge_done = 1; 1622 merge_done = 1;
1524 WARN_ON(eh->eh_entries == 0); 1623 WARN_ON(eh->eh_entries == 0);
1525 if (!eh->eh_entries) 1624 if (!eh->eh_entries)
1526 ext4_error(inode->i_sb, "ext4_ext_try_to_merge", 1625 ext4_error(inode->i_sb,
1527 "inode#%lu, eh->eh_entries = 0!", inode->i_ino); 1626 "inode#%lu, eh->eh_entries = 0!",
1627 inode->i_ino);
1528 } 1628 }
1529 1629
1530 return merge_done; 1630 return merge_done;
@@ -1597,13 +1697,19 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1597 ext4_lblk_t next; 1697 ext4_lblk_t next;
1598 unsigned uninitialized = 0; 1698 unsigned uninitialized = 0;
1599 1699
1600 BUG_ON(ext4_ext_get_actual_len(newext) == 0); 1700 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1701 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1702 return -EIO;
1703 }
1601 depth = ext_depth(inode); 1704 depth = ext_depth(inode);
1602 ex = path[depth].p_ext; 1705 ex = path[depth].p_ext;
1603 BUG_ON(path[depth].p_hdr == NULL); 1706 if (unlikely(path[depth].p_hdr == NULL)) {
1707 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1708 return -EIO;
1709 }
1604 1710
1605 /* try to insert block into found extent and return */ 1711 /* try to insert block into found extent and return */
1606 if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) 1712 if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1607 && ext4_can_extents_be_merged(inode, ex, newext)) { 1713 && ext4_can_extents_be_merged(inode, ex, newext)) {
1608 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", 1714 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1609 ext4_ext_is_uninitialized(newext), 1715 ext4_ext_is_uninitialized(newext),
@@ -1724,7 +1830,7 @@ has_space:
1724 1830
1725merge: 1831merge:
1726 /* try to merge extents to the right */ 1832 /* try to merge extents to the right */
1727 if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) 1833 if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
1728 ext4_ext_try_to_merge(inode, path, nearex); 1834 ext4_ext_try_to_merge(inode, path, nearex);
1729 1835
1730 /* try to merge extents to the left */ 1836 /* try to merge extents to the left */
@@ -1772,7 +1878,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1772 } 1878 }
1773 1879
1774 depth = ext_depth(inode); 1880 depth = ext_depth(inode);
1775 BUG_ON(path[depth].p_hdr == NULL); 1881 if (unlikely(path[depth].p_hdr == NULL)) {
1882 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1883 err = -EIO;
1884 break;
1885 }
1776 ex = path[depth].p_ext; 1886 ex = path[depth].p_ext;
1777 next = ext4_ext_next_allocated_block(path); 1887 next = ext4_ext_next_allocated_block(path);
1778 1888
@@ -1823,7 +1933,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1823 cbex.ec_type = EXT4_EXT_CACHE_EXTENT; 1933 cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1824 } 1934 }
1825 1935
1826 BUG_ON(cbex.ec_len == 0); 1936 if (unlikely(cbex.ec_len == 0)) {
1937 EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
1938 err = -EIO;
1939 break;
1940 }
1827 err = func(inode, path, &cbex, ex, cbdata); 1941 err = func(inode, path, &cbex, ex, cbdata);
1828 ext4_ext_drop_refs(path); 1942 ext4_ext_drop_refs(path);
1829 1943
@@ -1937,7 +2051,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
1937 2051
1938 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP && 2052 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
1939 cex->ec_type != EXT4_EXT_CACHE_EXTENT); 2053 cex->ec_type != EXT4_EXT_CACHE_EXTENT);
1940 if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { 2054 if (in_range(block, cex->ec_block, cex->ec_len)) {
1941 ex->ee_block = cpu_to_le32(cex->ec_block); 2055 ex->ee_block = cpu_to_le32(cex->ec_block);
1942 ext4_ext_store_pblock(ex, cex->ec_start); 2056 ext4_ext_store_pblock(ex, cex->ec_start);
1943 ex->ee_len = cpu_to_le16(cex->ec_len); 2057 ex->ee_len = cpu_to_le16(cex->ec_len);
@@ -1966,7 +2080,10 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1966 /* free index block */ 2080 /* free index block */
1967 path--; 2081 path--;
1968 leaf = idx_pblock(path->p_idx); 2082 leaf = idx_pblock(path->p_idx);
1969 BUG_ON(path->p_hdr->eh_entries == 0); 2083 if (unlikely(path->p_hdr->eh_entries == 0)) {
2084 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2085 return -EIO;
2086 }
1970 err = ext4_ext_get_access(handle, inode, path); 2087 err = ext4_ext_get_access(handle, inode, path);
1971 if (err) 2088 if (err)
1972 return err; 2089 return err;
@@ -2104,8 +2221,10 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2104 if (!path[depth].p_hdr) 2221 if (!path[depth].p_hdr)
2105 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); 2222 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2106 eh = path[depth].p_hdr; 2223 eh = path[depth].p_hdr;
2107 BUG_ON(eh == NULL); 2224 if (unlikely(path[depth].p_hdr == NULL)) {
2108 2225 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2226 return -EIO;
2227 }
2109 /* find where to start removing */ 2228 /* find where to start removing */
2110 ex = EXT_LAST_EXTENT(eh); 2229 ex = EXT_LAST_EXTENT(eh);
2111 2230
@@ -2968,7 +3087,7 @@ fix_extent_len:
2968 ext4_ext_dirty(handle, inode, path + depth); 3087 ext4_ext_dirty(handle, inode, path + depth);
2969 return err; 3088 return err;
2970} 3089}
2971static int ext4_convert_unwritten_extents_dio(handle_t *handle, 3090static int ext4_convert_unwritten_extents_endio(handle_t *handle,
2972 struct inode *inode, 3091 struct inode *inode,
2973 struct ext4_ext_path *path) 3092 struct ext4_ext_path *path)
2974{ 3093{
@@ -3023,6 +3142,14 @@ out:
3023 return err; 3142 return err;
3024} 3143}
3025 3144
3145static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3146 sector_t block, int count)
3147{
3148 int i;
3149 for (i = 0; i < count; i++)
3150 unmap_underlying_metadata(bdev, block + i);
3151}
3152
3026static int 3153static int
3027ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3154ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3028 ext4_lblk_t iblock, unsigned int max_blocks, 3155 ext4_lblk_t iblock, unsigned int max_blocks,
@@ -3040,8 +3167,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3040 flags, allocated); 3167 flags, allocated);
3041 ext4_ext_show_leaf(inode, path); 3168 ext4_ext_show_leaf(inode, path);
3042 3169
3043 /* DIO get_block() before submit the IO, split the extent */ 3170 /* get_block() before submit the IO, split the extent */
3044 if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { 3171 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3045 ret = ext4_split_unwritten_extents(handle, 3172 ret = ext4_split_unwritten_extents(handle,
3046 inode, path, iblock, 3173 inode, path, iblock,
3047 max_blocks, flags); 3174 max_blocks, flags);
@@ -3051,14 +3178,16 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3051 * completed 3178 * completed
3052 */ 3179 */
3053 if (io) 3180 if (io)
3054 io->flag = DIO_AIO_UNWRITTEN; 3181 io->flag = EXT4_IO_UNWRITTEN;
3055 else 3182 else
3056 EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN; 3183 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3184 if (ext4_should_dioread_nolock(inode))
3185 set_buffer_uninit(bh_result);
3057 goto out; 3186 goto out;
3058 } 3187 }
3059 /* async DIO end_io complete, convert the filled extent to written */ 3188 /* IO end_io complete, convert the filled extent to written */
3060 if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { 3189 if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
3061 ret = ext4_convert_unwritten_extents_dio(handle, inode, 3190 ret = ext4_convert_unwritten_extents_endio(handle, inode,
3062 path); 3191 path);
3063 if (ret >= 0) 3192 if (ret >= 0)
3064 ext4_update_inode_fsync_trans(handle, inode, 1); 3193 ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -3098,6 +3227,30 @@ out:
3098 } else 3227 } else
3099 allocated = ret; 3228 allocated = ret;
3100 set_buffer_new(bh_result); 3229 set_buffer_new(bh_result);
3230 /*
3231 * if we allocated more blocks than requested
3232 * we need to make sure we unmap the extra block
3233 * allocated. The actual needed block will get
3234 * unmapped later when we find the buffer_head marked
3235 * new.
3236 */
3237 if (allocated > max_blocks) {
3238 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3239 newblock + max_blocks,
3240 allocated - max_blocks);
3241 allocated = max_blocks;
3242 }
3243
3244 /*
3245 * If we have done fallocate with the offset that is already
3246 * delayed allocated, we would have block reservation
3247 * and quota reservation done in the delayed write path.
3248 * But fallocate would have already updated quota and block
3249 * count for this offset. So cancel these reservation
3250 */
3251 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
3252 ext4_da_update_reserve_space(inode, allocated, 0);
3253
3101map_out: 3254map_out:
3102 set_buffer_mapped(bh_result); 3255 set_buffer_mapped(bh_result);
3103out1: 3256out1:
@@ -3138,7 +3291,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3138{ 3291{
3139 struct ext4_ext_path *path = NULL; 3292 struct ext4_ext_path *path = NULL;
3140 struct ext4_extent_header *eh; 3293 struct ext4_extent_header *eh;
3141 struct ext4_extent newex, *ex; 3294 struct ext4_extent newex, *ex, *last_ex;
3142 ext4_fsblk_t newblock; 3295 ext4_fsblk_t newblock;
3143 int err = 0, depth, ret, cache_type; 3296 int err = 0, depth, ret, cache_type;
3144 unsigned int allocated = 0; 3297 unsigned int allocated = 0;
@@ -3190,7 +3343,13 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3190 * this situation is possible, though, _during_ tree modification; 3343 * this situation is possible, though, _during_ tree modification;
3191 * this is why assert can't be put in ext4_ext_find_extent() 3344 * this is why assert can't be put in ext4_ext_find_extent()
3192 */ 3345 */
3193 BUG_ON(path[depth].p_ext == NULL && depth != 0); 3346 if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3347 EXT4_ERROR_INODE(inode, "bad extent address "
3348 "iblock: %d, depth: %d pblock %lld",
3349 iblock, depth, path[depth].p_block);
3350 err = -EIO;
3351 goto out2;
3352 }
3194 eh = path[depth].p_hdr; 3353 eh = path[depth].p_hdr;
3195 3354
3196 ex = path[depth].p_ext; 3355 ex = path[depth].p_ext;
@@ -3205,7 +3364,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3205 */ 3364 */
3206 ee_len = ext4_ext_get_actual_len(ex); 3365 ee_len = ext4_ext_get_actual_len(ex);
3207 /* if found extent covers block, simply return it */ 3366 /* if found extent covers block, simply return it */
3208 if (iblock >= ee_block && iblock < ee_block + ee_len) { 3367 if (in_range(iblock, ee_block, ee_len)) {
3209 newblock = iblock - ee_block + ee_start; 3368 newblock = iblock - ee_block + ee_start;
3210 /* number of remaining blocks in the extent */ 3369 /* number of remaining blocks in the extent */
3211 allocated = ee_len - (iblock - ee_block); 3370 allocated = ee_len - (iblock - ee_block);
@@ -3297,21 +3456,35 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3297 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ 3456 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
3298 ext4_ext_mark_uninitialized(&newex); 3457 ext4_ext_mark_uninitialized(&newex);
3299 /* 3458 /*
3300 * io_end structure was created for every async 3459 * io_end structure was created for every IO write to an
3301 * direct IO write to the middle of the file. 3460 * uninitialized extent. To avoid unecessary conversion,
3302 * To avoid unecessary convertion for every aio dio rewrite 3461 * here we flag the IO that really needs the conversion.
3303 * to the mid of file, here we flag the IO that is really
3304 * need the convertion.
3305 * For non asycn direct IO case, flag the inode state 3462 * For non asycn direct IO case, flag the inode state
3306 * that we need to perform convertion when IO is done. 3463 * that we need to perform convertion when IO is done.
3307 */ 3464 */
3308 if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { 3465 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3309 if (io) 3466 if (io)
3310 io->flag = DIO_AIO_UNWRITTEN; 3467 io->flag = EXT4_IO_UNWRITTEN;
3311 else 3468 else
3312 EXT4_I(inode)->i_state |= 3469 ext4_set_inode_state(inode,
3313 EXT4_STATE_DIO_UNWRITTEN;; 3470 EXT4_STATE_DIO_UNWRITTEN);
3314 } 3471 }
3472 if (ext4_should_dioread_nolock(inode))
3473 set_buffer_uninit(bh_result);
3474 }
3475
3476 if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
3477 if (unlikely(!eh->eh_entries)) {
3478 EXT4_ERROR_INODE(inode,
3479 "eh->eh_entries == 0 ee_block %d",
3480 ex->ee_block);
3481 err = -EIO;
3482 goto out2;
3483 }
3484 last_ex = EXT_LAST_EXTENT(eh);
3485 if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
3486 + ext4_ext_get_actual_len(last_ex))
3487 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
3315 } 3488 }
3316 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3489 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3317 if (err) { 3490 if (err) {
@@ -3327,9 +3500,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3327 /* previous routine could use block we allocated */ 3500 /* previous routine could use block we allocated */
3328 newblock = ext_pblock(&newex); 3501 newblock = ext_pblock(&newex);
3329 allocated = ext4_ext_get_actual_len(&newex); 3502 allocated = ext4_ext_get_actual_len(&newex);
3503 if (allocated > max_blocks)
3504 allocated = max_blocks;
3330 set_buffer_new(bh_result); 3505 set_buffer_new(bh_result);
3331 3506
3332 /* 3507 /*
3508 * Update reserved blocks/metadata blocks after successful
3509 * block allocation which had been deferred till now.
3510 */
3511 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
3512 ext4_da_update_reserve_space(inode, allocated, 1);
3513
3514 /*
3333 * Cache the extent and update transaction to commit on fdatasync only 3515 * Cache the extent and update transaction to commit on fdatasync only
3334 * when it is _not_ an uninitialized extent. 3516 * when it is _not_ an uninitialized extent.
3335 */ 3517 */
@@ -3437,6 +3619,13 @@ static void ext4_falloc_update_inode(struct inode *inode,
3437 i_size_write(inode, new_size); 3619 i_size_write(inode, new_size);
3438 if (new_size > EXT4_I(inode)->i_disksize) 3620 if (new_size > EXT4_I(inode)->i_disksize)
3439 ext4_update_i_disksize(inode, new_size); 3621 ext4_update_i_disksize(inode, new_size);
3622 } else {
3623 /*
3624 * Mark that we allocate beyond EOF so the subsequent truncate
3625 * can proceed even if the new size is the same as i_size.
3626 */
3627 if (new_size > i_size_read(inode))
3628 EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL;
3440 } 3629 }
3441 3630
3442} 3631}
@@ -3541,7 +3730,7 @@ retry:
3541 * Returns 0 on success. 3730 * Returns 0 on success.
3542 */ 3731 */
3543int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 3732int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3544 loff_t len) 3733 ssize_t len)
3545{ 3734{
3546 handle_t *handle; 3735 handle_t *handle;
3547 ext4_lblk_t block; 3736 ext4_lblk_t block;
@@ -3573,7 +3762,7 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3573 map_bh.b_state = 0; 3762 map_bh.b_state = 0;
3574 ret = ext4_get_blocks(handle, inode, block, 3763 ret = ext4_get_blocks(handle, inode, block,
3575 max_blocks, &map_bh, 3764 max_blocks, &map_bh,
3576 EXT4_GET_BLOCKS_DIO_CONVERT_EXT); 3765 EXT4_GET_BLOCKS_IO_CONVERT_EXT);
3577 if (ret <= 0) { 3766 if (ret <= 0) {
3578 WARN_ON(ret <= 0); 3767 WARN_ON(ret <= 0);
3579 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3768 printk(KERN_ERR "%s: ext4_ext_get_blocks "
@@ -3677,7 +3866,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
3677 int error = 0; 3866 int error = 0;
3678 3867
3679 /* in-inode? */ 3868 /* in-inode? */
3680 if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { 3869 if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
3681 struct ext4_iloc iloc; 3870 struct ext4_iloc iloc;
3682 int offset; /* offset of xattr in inode */ 3871 int offset; /* offset of xattr in inode */
3683 3872
@@ -3690,6 +3879,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
3690 physical += offset; 3879 physical += offset;
3691 length = EXT4_SB(inode->i_sb)->s_inode_size - offset; 3880 length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
3692 flags |= FIEMAP_EXTENT_DATA_INLINE; 3881 flags |= FIEMAP_EXTENT_DATA_INLINE;
3882 brelse(iloc.bh);
3693 } else { /* external block */ 3883 } else { /* external block */
3694 physical = EXT4_I(inode)->i_file_acl << blockbits; 3884 physical = EXT4_I(inode)->i_file_acl << blockbits;
3695 length = inode->i_sb->s_blocksize; 3885 length = inode->i_sb->s_blocksize;
@@ -3705,7 +3895,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3705 __u64 start, __u64 len) 3895 __u64 start, __u64 len)
3706{ 3896{
3707 ext4_lblk_t start_blk; 3897 ext4_lblk_t start_blk;
3708 ext4_lblk_t len_blks;
3709 int error = 0; 3898 int error = 0;
3710 3899
3711 /* fallback to generic here if not in extents fmt */ 3900 /* fallback to generic here if not in extents fmt */
@@ -3719,8 +3908,14 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3719 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { 3908 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
3720 error = ext4_xattr_fiemap(inode, fieinfo); 3909 error = ext4_xattr_fiemap(inode, fieinfo);
3721 } else { 3910 } else {
3911 ext4_lblk_t len_blks;
3912 __u64 last_blk;
3913
3722 start_blk = start >> inode->i_sb->s_blocksize_bits; 3914 start_blk = start >> inode->i_sb->s_blocksize_bits;
3723 len_blks = len >> inode->i_sb->s_blocksize_bits; 3915 last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
3916 if (last_blk >= EXT_MAX_BLOCK)
3917 last_blk = EXT_MAX_BLOCK-1;
3918 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
3724 3919
3725 /* 3920 /*
3726 * Walk the extent tree gathering extent information. 3921 * Walk the extent tree gathering extent information.
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 9630583cef28..d0776e410f34 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -23,6 +23,7 @@
23#include <linux/jbd2.h> 23#include <linux/jbd2.h>
24#include <linux/mount.h> 24#include <linux/mount.h>
25#include <linux/path.h> 25#include <linux/path.h>
26#include <linux/quotaops.h>
26#include "ext4.h" 27#include "ext4.h"
27#include "ext4_jbd2.h" 28#include "ext4_jbd2.h"
28#include "xattr.h" 29#include "xattr.h"
@@ -35,9 +36,9 @@
35 */ 36 */
36static int ext4_release_file(struct inode *inode, struct file *filp) 37static int ext4_release_file(struct inode *inode, struct file *filp)
37{ 38{
38 if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) { 39 if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
39 ext4_alloc_da_blocks(inode); 40 ext4_alloc_da_blocks(inode);
40 EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE; 41 ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
41 } 42 }
42 /* if we are the last writer on the inode, drop the block reservation */ 43 /* if we are the last writer on the inode, drop the block reservation */
43 if ((filp->f_mode & FMODE_WRITE) && 44 if ((filp->f_mode & FMODE_WRITE) &&
@@ -116,18 +117,16 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
116 * devices or filesystem images. 117 * devices or filesystem images.
117 */ 118 */
118 memset(buf, 0, sizeof(buf)); 119 memset(buf, 0, sizeof(buf));
119 path.mnt = mnt->mnt_parent; 120 path.mnt = mnt;
120 path.dentry = mnt->mnt_mountpoint; 121 path.dentry = mnt->mnt_root;
121 path_get(&path);
122 cp = d_path(&path, buf, sizeof(buf)); 122 cp = d_path(&path, buf, sizeof(buf));
123 path_put(&path);
124 if (!IS_ERR(cp)) { 123 if (!IS_ERR(cp)) {
125 memcpy(sbi->s_es->s_last_mounted, cp, 124 memcpy(sbi->s_es->s_last_mounted, cp,
126 sizeof(sbi->s_es->s_last_mounted)); 125 sizeof(sbi->s_es->s_last_mounted));
127 sb->s_dirt = 1; 126 sb->s_dirt = 1;
128 } 127 }
129 } 128 }
130 return generic_file_open(inode, filp); 129 return dquot_file_open(inode, filp);
131} 130}
132 131
133const struct file_operations ext4_file_operations = { 132const struct file_operations ext4_file_operations = {
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 0b22497d92e1..0d0c3239c1cd 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -63,7 +63,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
63 if (inode->i_sb->s_flags & MS_RDONLY) 63 if (inode->i_sb->s_flags & MS_RDONLY)
64 return 0; 64 return 0;
65 65
66 ret = flush_aio_dio_completed_IO(inode); 66 ret = flush_completed_IO(inode);
67 if (ret < 0) 67 if (ret < 0)
68 return ret; 68 return ret;
69 69
@@ -88,9 +88,21 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
88 return ext4_force_commit(inode->i_sb); 88 return ext4_force_commit(inode->i_sb);
89 89
90 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; 90 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
91 if (jbd2_log_start_commit(journal, commit_tid)) 91 if (jbd2_log_start_commit(journal, commit_tid)) {
92 /*
93 * When the journal is on a different device than the
94 * fs data disk, we need to issue the barrier in
95 * writeback mode. (In ordered mode, the jbd2 layer
96 * will take care of issuing the barrier. In
97 * data=journal, all of the data blocks are written to
98 * the journal device.)
99 */
100 if (ext4_should_writeback_data(inode) &&
101 (journal->j_fs_dev != journal->j_dev) &&
102 (journal->j_flags & JBD2_BARRIER))
103 blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
92 jbd2_log_wait_commit(journal, commit_tid); 104 jbd2_log_wait_commit(journal, commit_tid);
93 else if (journal->j_flags & JBD2_BARRIER) 105 } else if (journal->j_flags & JBD2_BARRIER)
94 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 106 blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
95 return ret; 107 return ret;
96} 108}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f3624ead4f6c..57f6eef6ccd6 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -76,8 +76,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
76 /* If checksum is bad mark all blocks and inodes use to prevent 76 /* If checksum is bad mark all blocks and inodes use to prevent
77 * allocation, essentially implementing a per-group read-only flag. */ 77 * allocation, essentially implementing a per-group read-only flag. */
78 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 78 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
79 ext4_error(sb, __func__, "Checksum bad for group %u", 79 ext4_error(sb, "Checksum bad for group %u", block_group);
80 block_group);
81 ext4_free_blks_set(sb, gdp, 0); 80 ext4_free_blks_set(sb, gdp, 0);
82 ext4_free_inodes_set(sb, gdp, 0); 81 ext4_free_inodes_set(sb, gdp, 0);
83 ext4_itable_unused_set(sb, gdp, 0); 82 ext4_itable_unused_set(sb, gdp, 0);
@@ -111,8 +110,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
111 bitmap_blk = ext4_inode_bitmap(sb, desc); 110 bitmap_blk = ext4_inode_bitmap(sb, desc);
112 bh = sb_getblk(sb, bitmap_blk); 111 bh = sb_getblk(sb, bitmap_blk);
113 if (unlikely(!bh)) { 112 if (unlikely(!bh)) {
114 ext4_error(sb, __func__, 113 ext4_error(sb, "Cannot read inode bitmap - "
115 "Cannot read inode bitmap - "
116 "block_group = %u, inode_bitmap = %llu", 114 "block_group = %u, inode_bitmap = %llu",
117 block_group, bitmap_blk); 115 block_group, bitmap_blk);
118 return NULL; 116 return NULL;
@@ -153,8 +151,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
153 set_bitmap_uptodate(bh); 151 set_bitmap_uptodate(bh);
154 if (bh_submit_read(bh) < 0) { 152 if (bh_submit_read(bh) < 0) {
155 put_bh(bh); 153 put_bh(bh);
156 ext4_error(sb, __func__, 154 ext4_error(sb, "Cannot read inode bitmap - "
157 "Cannot read inode bitmap - "
158 "block_group = %u, inode_bitmap = %llu", 155 "block_group = %u, inode_bitmap = %llu",
159 block_group, bitmap_blk); 156 block_group, bitmap_blk);
160 return NULL; 157 return NULL;
@@ -217,10 +214,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
217 * Note: we must free any quota before locking the superblock, 214 * Note: we must free any quota before locking the superblock,
218 * as writing the quota to disk may need the lock as well. 215 * as writing the quota to disk may need the lock as well.
219 */ 216 */
220 vfs_dq_init(inode); 217 dquot_initialize(inode);
221 ext4_xattr_delete_inode(handle, inode); 218 ext4_xattr_delete_inode(handle, inode);
222 vfs_dq_free_inode(inode); 219 dquot_free_inode(inode);
223 vfs_dq_drop(inode); 220 dquot_drop(inode);
224 221
225 is_directory = S_ISDIR(inode->i_mode); 222 is_directory = S_ISDIR(inode->i_mode);
226 223
@@ -229,8 +226,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
229 226
230 es = EXT4_SB(sb)->s_es; 227 es = EXT4_SB(sb)->s_es;
231 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { 228 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
232 ext4_error(sb, "ext4_free_inode", 229 ext4_error(sb, "reserved or nonexistent inode %lu", ino);
233 "reserved or nonexistent inode %lu", ino);
234 goto error_return; 230 goto error_return;
235 } 231 }
236 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); 232 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
@@ -248,8 +244,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
248 cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), 244 cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
249 bit, bitmap_bh->b_data); 245 bit, bitmap_bh->b_data);
250 if (!cleared) 246 if (!cleared)
251 ext4_error(sb, "ext4_free_inode", 247 ext4_error(sb, "bit already cleared for inode %lu", ino);
252 "bit already cleared for inode %lu", ino);
253 else { 248 else {
254 gdp = ext4_get_group_desc(sb, block_group, &bh2); 249 gdp = ext4_get_group_desc(sb, block_group, &bh2);
255 250
@@ -268,7 +263,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
268 ext4_group_t f; 263 ext4_group_t f;
269 264
270 f = ext4_flex_group(sbi, block_group); 265 f = ext4_flex_group(sbi, block_group);
271 atomic_dec(&sbi->s_flex_groups[f].free_inodes); 266 atomic_dec(&sbi->s_flex_groups[f].used_dirs);
272 } 267 }
273 268
274 } 269 }
@@ -736,8 +731,7 @@ static int ext4_claim_inode(struct super_block *sb,
736 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || 731 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
737 ino > EXT4_INODES_PER_GROUP(sb)) { 732 ino > EXT4_INODES_PER_GROUP(sb)) {
738 ext4_unlock_group(sb, group); 733 ext4_unlock_group(sb, group);
739 ext4_error(sb, __func__, 734 ext4_error(sb, "reserved inode or inode > inodes count - "
740 "reserved inode or inode > inodes count - "
741 "block_group = %u, inode=%lu", group, 735 "block_group = %u, inode=%lu", group,
742 ino + group * EXT4_INODES_PER_GROUP(sb)); 736 ino + group * EXT4_INODES_PER_GROUP(sb));
743 return 1; 737 return 1;
@@ -779,7 +773,7 @@ static int ext4_claim_inode(struct super_block *sb,
779 if (sbi->s_log_groups_per_flex) { 773 if (sbi->s_log_groups_per_flex) {
780 ext4_group_t f = ext4_flex_group(sbi, group); 774 ext4_group_t f = ext4_flex_group(sbi, group);
781 775
782 atomic_inc(&sbi->s_flex_groups[f].free_inodes); 776 atomic_inc(&sbi->s_flex_groups[f].used_dirs);
783 } 777 }
784 } 778 }
785 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 779 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
@@ -904,7 +898,7 @@ repeat_in_this_group:
904 BUFFER_TRACE(inode_bitmap_bh, 898 BUFFER_TRACE(inode_bitmap_bh,
905 "call ext4_handle_dirty_metadata"); 899 "call ext4_handle_dirty_metadata");
906 err = ext4_handle_dirty_metadata(handle, 900 err = ext4_handle_dirty_metadata(handle,
907 inode, 901 NULL,
908 inode_bitmap_bh); 902 inode_bitmap_bh);
909 if (err) 903 if (err)
910 goto fail; 904 goto fail;
@@ -1029,15 +1023,16 @@ got:
1029 inode->i_generation = sbi->s_next_generation++; 1023 inode->i_generation = sbi->s_next_generation++;
1030 spin_unlock(&sbi->s_next_gen_lock); 1024 spin_unlock(&sbi->s_next_gen_lock);
1031 1025
1032 ei->i_state = EXT4_STATE_NEW; 1026 ei->i_state_flags = 0;
1027 ext4_set_inode_state(inode, EXT4_STATE_NEW);
1033 1028
1034 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; 1029 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
1035 1030
1036 ret = inode; 1031 ret = inode;
1037 if (vfs_dq_alloc_inode(inode)) { 1032 dquot_initialize(inode);
1038 err = -EDQUOT; 1033 err = dquot_alloc_inode(inode);
1034 if (err)
1039 goto fail_drop; 1035 goto fail_drop;
1040 }
1041 1036
1042 err = ext4_init_acl(handle, inode, dir); 1037 err = ext4_init_acl(handle, inode, dir);
1043 if (err) 1038 if (err)
@@ -1074,10 +1069,10 @@ really_out:
1074 return ret; 1069 return ret;
1075 1070
1076fail_free_drop: 1071fail_free_drop:
1077 vfs_dq_free_inode(inode); 1072 dquot_free_inode(inode);
1078 1073
1079fail_drop: 1074fail_drop:
1080 vfs_dq_drop(inode); 1075 dquot_drop(inode);
1081 inode->i_flags |= S_NOQUOTA; 1076 inode->i_flags |= S_NOQUOTA;
1082 inode->i_nlink = 0; 1077 inode->i_nlink = 0;
1083 unlock_new_inode(inode); 1078 unlock_new_inode(inode);
@@ -1098,8 +1093,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
1098 1093
1099 /* Error cases - e2fsck has already cleaned up for us */ 1094 /* Error cases - e2fsck has already cleaned up for us */
1100 if (ino > max_ino) { 1095 if (ino > max_ino) {
1101 ext4_warning(sb, __func__, 1096 ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino);
1102 "bad orphan ino %lu! e2fsck was run?", ino);
1103 goto error; 1097 goto error;
1104 } 1098 }
1105 1099
@@ -1107,8 +1101,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
1107 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); 1101 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
1108 bitmap_bh = ext4_read_inode_bitmap(sb, block_group); 1102 bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
1109 if (!bitmap_bh) { 1103 if (!bitmap_bh) {
1110 ext4_warning(sb, __func__, 1104 ext4_warning(sb, "inode bitmap error for orphan %lu", ino);
1111 "inode bitmap error for orphan %lu", ino);
1112 goto error; 1105 goto error;
1113 } 1106 }
1114 1107
@@ -1140,8 +1133,7 @@ iget_failed:
1140 err = PTR_ERR(inode); 1133 err = PTR_ERR(inode);
1141 inode = NULL; 1134 inode = NULL;
1142bad_orphan: 1135bad_orphan:
1143 ext4_warning(sb, __func__, 1136 ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino);
1144 "bad orphan inode %lu! e2fsck was run?", ino);
1145 printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n", 1137 printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
1146 bit, (unsigned long long)bitmap_bh->b_blocknr, 1138 bit, (unsigned long long)bitmap_bh->b_blocknr,
1147 ext4_test_bit(bit, bitmap_bh->b_data)); 1139 ext4_test_bit(bit, bitmap_bh->b_data));
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ab807963a614..81d605412844 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -38,6 +38,8 @@
38#include <linux/uio.h> 38#include <linux/uio.h>
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h> 40#include <linux/workqueue.h>
41#include <linux/kernel.h>
42#include <linux/slab.h>
41 43
42#include "ext4_jbd2.h" 44#include "ext4_jbd2.h"
43#include "xattr.h" 45#include "xattr.h"
@@ -170,6 +172,9 @@ void ext4_delete_inode(struct inode *inode)
170 handle_t *handle; 172 handle_t *handle;
171 int err; 173 int err;
172 174
175 if (!is_bad_inode(inode))
176 dquot_initialize(inode);
177
173 if (ext4_should_order_data(inode)) 178 if (ext4_should_order_data(inode))
174 ext4_begin_ordered_truncate(inode, 0); 179 ext4_begin_ordered_truncate(inode, 0);
175 truncate_inode_pages(&inode->i_data, 0); 180 truncate_inode_pages(&inode->i_data, 0);
@@ -194,7 +199,7 @@ void ext4_delete_inode(struct inode *inode)
194 inode->i_size = 0; 199 inode->i_size = 0;
195 err = ext4_mark_inode_dirty(handle, inode); 200 err = ext4_mark_inode_dirty(handle, inode);
196 if (err) { 201 if (err) {
197 ext4_warning(inode->i_sb, __func__, 202 ext4_warning(inode->i_sb,
198 "couldn't mark inode dirty (err %d)", err); 203 "couldn't mark inode dirty (err %d)", err);
199 goto stop_handle; 204 goto stop_handle;
200 } 205 }
@@ -212,7 +217,7 @@ void ext4_delete_inode(struct inode *inode)
212 if (err > 0) 217 if (err > 0)
213 err = ext4_journal_restart(handle, 3); 218 err = ext4_journal_restart(handle, 3);
214 if (err != 0) { 219 if (err != 0) {
215 ext4_warning(inode->i_sb, __func__, 220 ext4_warning(inode->i_sb,
216 "couldn't extend journal (err %d)", err); 221 "couldn't extend journal (err %d)", err);
217 stop_handle: 222 stop_handle:
218 ext4_journal_stop(handle); 223 ext4_journal_stop(handle);
@@ -323,8 +328,7 @@ static int ext4_block_to_path(struct inode *inode,
323 offsets[n++] = i_block & (ptrs - 1); 328 offsets[n++] = i_block & (ptrs - 1);
324 final = ptrs; 329 final = ptrs;
325 } else { 330 } else {
326 ext4_warning(inode->i_sb, "ext4_block_to_path", 331 ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
327 "block %lu > max in inode %lu",
328 i_block + direct_blocks + 332 i_block + direct_blocks +
329 indirect_blocks + double_blocks, inode->i_ino); 333 indirect_blocks + double_blocks, inode->i_ino);
330 } 334 }
@@ -344,7 +348,7 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
344 if (blk && 348 if (blk &&
345 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 349 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
346 blk, 1))) { 350 blk, 1))) {
347 ext4_error(inode->i_sb, function, 351 __ext4_error(inode->i_sb, function,
348 "invalid block reference %u " 352 "invalid block reference %u "
349 "in inode #%lu", blk, inode->i_ino); 353 "in inode #%lu", blk, inode->i_ino);
350 return -EIO; 354 return -EIO;
@@ -607,7 +611,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
607 if (*err) 611 if (*err)
608 goto failed_out; 612 goto failed_out;
609 613
610 BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS); 614 if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
615 EXT4_ERROR_INODE(inode,
616 "current_block %llu + count %lu > %d!",
617 current_block, count,
618 EXT4_MAX_BLOCK_FILE_PHYS);
619 *err = -EIO;
620 goto failed_out;
621 }
611 622
612 target -= count; 623 target -= count;
613 /* allocate blocks for indirect blocks */ 624 /* allocate blocks for indirect blocks */
@@ -643,7 +654,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
643 ar.flags = EXT4_MB_HINT_DATA; 654 ar.flags = EXT4_MB_HINT_DATA;
644 655
645 current_block = ext4_mb_new_blocks(handle, &ar, err); 656 current_block = ext4_mb_new_blocks(handle, &ar, err);
646 BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS); 657 if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
658 EXT4_ERROR_INODE(inode,
659 "current_block %llu + ar.len %d > %d!",
660 current_block, ar.len,
661 EXT4_MAX_BLOCK_FILE_PHYS);
662 *err = -EIO;
663 goto failed_out;
664 }
647 665
648 if (*err && (target == blks)) { 666 if (*err && (target == blks)) {
649 /* 667 /*
@@ -1009,86 +1027,115 @@ qsize_t *ext4_get_reserved_space(struct inode *inode)
1009 return &EXT4_I(inode)->i_reserved_quota; 1027 return &EXT4_I(inode)->i_reserved_quota;
1010} 1028}
1011#endif 1029#endif
1030
1012/* 1031/*
1013 * Calculate the number of metadata blocks need to reserve 1032 * Calculate the number of metadata blocks need to reserve
1014 * to allocate @blocks for non extent file based file 1033 * to allocate a new block at @lblocks for non extent file based file
1015 */ 1034 */
1016static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) 1035static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1036 sector_t lblock)
1017{ 1037{
1018 int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); 1038 struct ext4_inode_info *ei = EXT4_I(inode);
1019 int ind_blks, dind_blks, tind_blks; 1039 sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
1020 1040 int blk_bits;
1021 /* number of new indirect blocks needed */
1022 ind_blks = (blocks + icap - 1) / icap;
1023 1041
1024 dind_blks = (ind_blks + icap - 1) / icap; 1042 if (lblock < EXT4_NDIR_BLOCKS)
1043 return 0;
1025 1044
1026 tind_blks = 1; 1045 lblock -= EXT4_NDIR_BLOCKS;
1027 1046
1028 return ind_blks + dind_blks + tind_blks; 1047 if (ei->i_da_metadata_calc_len &&
1048 (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
1049 ei->i_da_metadata_calc_len++;
1050 return 0;
1051 }
1052 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
1053 ei->i_da_metadata_calc_len = 1;
1054 blk_bits = order_base_2(lblock);
1055 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
1029} 1056}
1030 1057
1031/* 1058/*
1032 * Calculate the number of metadata blocks need to reserve 1059 * Calculate the number of metadata blocks need to reserve
1033 * to allocate given number of blocks 1060 * to allocate a block located at @lblock
1034 */ 1061 */
1035static int ext4_calc_metadata_amount(struct inode *inode, int blocks) 1062static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
1036{ 1063{
1037 if (!blocks)
1038 return 0;
1039
1040 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 1064 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
1041 return ext4_ext_calc_metadata_amount(inode, blocks); 1065 return ext4_ext_calc_metadata_amount(inode, lblock);
1042 1066
1043 return ext4_indirect_calc_metadata_amount(inode, blocks); 1067 return ext4_indirect_calc_metadata_amount(inode, lblock);
1044} 1068}
1045 1069
1046static void ext4_da_update_reserve_space(struct inode *inode, int used) 1070/*
1071 * Called with i_data_sem down, which is important since we can call
1072 * ext4_discard_preallocations() from here.
1073 */
1074void ext4_da_update_reserve_space(struct inode *inode,
1075 int used, int quota_claim)
1047{ 1076{
1048 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1077 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1049 int total, mdb, mdb_free, mdb_claim = 0; 1078 struct ext4_inode_info *ei = EXT4_I(inode);
1050 1079 int mdb_free = 0, allocated_meta_blocks = 0;
1051 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1080
1052 /* recalculate the number of metablocks still need to be reserved */ 1081 spin_lock(&ei->i_block_reservation_lock);
1053 total = EXT4_I(inode)->i_reserved_data_blocks - used; 1082 trace_ext4_da_update_reserve_space(inode, used);
1054 mdb = ext4_calc_metadata_amount(inode, total); 1083 if (unlikely(used > ei->i_reserved_data_blocks)) {
1055 1084 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
1056 /* figure out how many metablocks to release */ 1085 "with only %d reserved data blocks\n",
1057 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 1086 __func__, inode->i_ino, used,
1058 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; 1087 ei->i_reserved_data_blocks);
1059 1088 WARN_ON(1);
1060 if (mdb_free) { 1089 used = ei->i_reserved_data_blocks;
1061 /* Account for allocated meta_blocks */ 1090 }
1062 mdb_claim = EXT4_I(inode)->i_allocated_meta_blocks; 1091
1063 BUG_ON(mdb_free < mdb_claim); 1092 /* Update per-inode reservations */
1064 mdb_free -= mdb_claim; 1093 ei->i_reserved_data_blocks -= used;
1065 1094 used += ei->i_allocated_meta_blocks;
1066 /* update fs dirty blocks counter */ 1095 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
1096 allocated_meta_blocks = ei->i_allocated_meta_blocks;
1097 ei->i_allocated_meta_blocks = 0;
1098 percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
1099
1100 if (ei->i_reserved_data_blocks == 0) {
1101 /*
1102 * We can release all of the reserved metadata blocks
1103 * only when we have written all of the delayed
1104 * allocation blocks.
1105 */
1106 mdb_free = ei->i_reserved_meta_blocks;
1107 ei->i_reserved_meta_blocks = 0;
1108 ei->i_da_metadata_calc_len = 0;
1067 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); 1109 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
1068 EXT4_I(inode)->i_allocated_meta_blocks = 0;
1069 EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1070 } 1110 }
1071
1072 /* update per-inode reservations */
1073 BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
1074 EXT4_I(inode)->i_reserved_data_blocks -= used;
1075 percpu_counter_sub(&sbi->s_dirtyblocks_counter, used + mdb_claim);
1076 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1111 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1077 1112
1078 vfs_dq_claim_block(inode, used + mdb_claim); 1113 /* Update quota subsystem */
1079 1114 if (quota_claim) {
1080 /* 1115 dquot_claim_block(inode, used);
1081 * free those over-booking quota for metadata blocks 1116 if (mdb_free)
1082 */ 1117 dquot_release_reservation_block(inode, mdb_free);
1083 if (mdb_free) 1118 } else {
1084 vfs_dq_release_reservation_block(inode, mdb_free); 1119 /*
1120 * We did fallocate with an offset that is already delayed
1121 * allocated. So on delayed allocated writeback we should
1122 * not update the quota for allocated blocks. But then
1123 * converting an fallocate region to initialized region would
1124 * have caused a metadata allocation. So claim quota for
1125 * that
1126 */
1127 if (allocated_meta_blocks)
1128 dquot_claim_block(inode, allocated_meta_blocks);
1129 dquot_release_reservation_block(inode, mdb_free + used);
1130 }
1085 1131
1086 /* 1132 /*
1087 * If we have done all the pending block allocations and if 1133 * If we have done all the pending block allocations and if
1088 * there aren't any writers on the inode, we can discard the 1134 * there aren't any writers on the inode, we can discard the
1089 * inode's preallocations. 1135 * inode's preallocations.
1090 */ 1136 */
1091 if (!total && (atomic_read(&inode->i_writecount) == 0)) 1137 if ((ei->i_reserved_data_blocks == 0) &&
1138 (atomic_read(&inode->i_writecount) == 0))
1092 ext4_discard_preallocations(inode); 1139 ext4_discard_preallocations(inode);
1093} 1140}
1094 1141
@@ -1096,7 +1143,7 @@ static int check_block_validity(struct inode *inode, const char *msg,
1096 sector_t logical, sector_t phys, int len) 1143 sector_t logical, sector_t phys, int len)
1097{ 1144{
1098 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { 1145 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
1099 ext4_error(inode->i_sb, msg, 1146 __ext4_error(inode->i_sb, msg,
1100 "inode #%lu logical block %llu mapped to %llu " 1147 "inode #%lu logical block %llu mapped to %llu "
1101 "(size %d)", inode->i_ino, 1148 "(size %d)", inode->i_ino,
1102 (unsigned long long) logical, 1149 (unsigned long long) logical,
@@ -1278,20 +1325,22 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1278 * i_data's format changing. Force the migrate 1325 * i_data's format changing. Force the migrate
1279 * to fail by clearing migrate flags 1326 * to fail by clearing migrate flags
1280 */ 1327 */
1281 EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; 1328 ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
1282 } 1329 }
1283 }
1284 1330
1331 /*
1332 * Update reserved blocks/metadata blocks after successful
1333 * block allocation which had been deferred till now. We don't
1334 * support fallocate for non extent files. So we can update
1335 * reserve space here.
1336 */
1337 if ((retval > 0) &&
1338 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
1339 ext4_da_update_reserve_space(inode, retval, 1);
1340 }
1285 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1341 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1286 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1342 EXT4_I(inode)->i_delalloc_reserved_flag = 0;
1287 1343
1288 /*
1289 * Update reserved blocks/metadata blocks after successful
1290 * block allocation which had been deferred till now.
1291 */
1292 if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
1293 ext4_da_update_reserve_space(inode, retval);
1294
1295 up_write((&EXT4_I(inode)->i_data_sem)); 1344 up_write((&EXT4_I(inode)->i_data_sem));
1296 if (retval > 0 && buffer_mapped(bh)) { 1345 if (retval > 0 && buffer_mapped(bh)) {
1297 int ret = check_block_validity(inode, "file system " 1346 int ret = check_block_validity(inode, "file system "
@@ -1504,6 +1553,8 @@ static void ext4_truncate_failed_write(struct inode *inode)
1504 ext4_truncate(inode); 1553 ext4_truncate(inode);
1505} 1554}
1506 1555
1556static int ext4_get_block_write(struct inode *inode, sector_t iblock,
1557 struct buffer_head *bh_result, int create);
1507static int ext4_write_begin(struct file *file, struct address_space *mapping, 1558static int ext4_write_begin(struct file *file, struct address_space *mapping,
1508 loff_t pos, unsigned len, unsigned flags, 1559 loff_t pos, unsigned len, unsigned flags,
1509 struct page **pagep, void **fsdata) 1560 struct page **pagep, void **fsdata)
@@ -1545,8 +1596,12 @@ retry:
1545 } 1596 }
1546 *pagep = page; 1597 *pagep = page;
1547 1598
1548 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1599 if (ext4_should_dioread_nolock(inode))
1549 ext4_get_block); 1600 ret = block_write_begin(file, mapping, pos, len, flags, pagep,
1601 fsdata, ext4_get_block_write);
1602 else
1603 ret = block_write_begin(file, mapping, pos, len, flags, pagep,
1604 fsdata, ext4_get_block);
1550 1605
1551 if (!ret && ext4_should_journal_data(inode)) { 1606 if (!ret && ext4_should_journal_data(inode)) {
1552 ret = walk_page_buffers(handle, page_buffers(page), 1607 ret = walk_page_buffers(handle, page_buffers(page),
@@ -1763,7 +1818,7 @@ static int ext4_journalled_write_end(struct file *file,
1763 new_i_size = pos + copied; 1818 new_i_size = pos + copied;
1764 if (new_i_size > inode->i_size) 1819 if (new_i_size > inode->i_size)
1765 i_size_write(inode, pos+copied); 1820 i_size_write(inode, pos+copied);
1766 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 1821 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1767 if (new_i_size > EXT4_I(inode)->i_disksize) { 1822 if (new_i_size > EXT4_I(inode)->i_disksize) {
1768 ext4_update_i_disksize(inode, new_i_size); 1823 ext4_update_i_disksize(inode, new_i_size);
1769 ret2 = ext4_mark_inode_dirty(handle, inode); 1824 ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -1797,11 +1852,16 @@ static int ext4_journalled_write_end(struct file *file,
1797 return ret ? ret : copied; 1852 return ret ? ret : copied;
1798} 1853}
1799 1854
1800static int ext4_da_reserve_space(struct inode *inode, int nrblocks) 1855/*
1856 * Reserve a single block located at lblock
1857 */
1858static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1801{ 1859{
1802 int retries = 0; 1860 int retries = 0;
1803 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1861 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1804 unsigned long md_needed, mdblocks, total = 0; 1862 struct ext4_inode_info *ei = EXT4_I(inode);
1863 unsigned long md_needed, md_reserved;
1864 int ret;
1805 1865
1806 /* 1866 /*
1807 * recalculate the amount of metadata blocks to reserve 1867 * recalculate the amount of metadata blocks to reserve
@@ -1809,35 +1869,33 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1809 * worse case is one extent per block 1869 * worse case is one extent per block
1810 */ 1870 */
1811repeat: 1871repeat:
1812 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1872 spin_lock(&ei->i_block_reservation_lock);
1813 total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; 1873 md_reserved = ei->i_reserved_meta_blocks;
1814 mdblocks = ext4_calc_metadata_amount(inode, total); 1874 md_needed = ext4_calc_metadata_amount(inode, lblock);
1815 BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); 1875 trace_ext4_da_reserve_space(inode, md_needed);
1816 1876 spin_unlock(&ei->i_block_reservation_lock);
1817 md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
1818 total = md_needed + nrblocks;
1819 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1820 1877
1821 /* 1878 /*
1822 * Make quota reservation here to prevent quota overflow 1879 * Make quota reservation here to prevent quota overflow
1823 * later. Real quota accounting is done at pages writeout 1880 * later. Real quota accounting is done at pages writeout
1824 * time. 1881 * time.
1825 */ 1882 */
1826 if (vfs_dq_reserve_block(inode, total)) 1883 ret = dquot_reserve_block(inode, md_needed + 1);
1827 return -EDQUOT; 1884 if (ret)
1885 return ret;
1828 1886
1829 if (ext4_claim_free_blocks(sbi, total)) { 1887 if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
1830 vfs_dq_release_reservation_block(inode, total); 1888 dquot_release_reservation_block(inode, md_needed + 1);
1831 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1889 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1832 yield(); 1890 yield();
1833 goto repeat; 1891 goto repeat;
1834 } 1892 }
1835 return -ENOSPC; 1893 return -ENOSPC;
1836 } 1894 }
1837 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1895 spin_lock(&ei->i_block_reservation_lock);
1838 EXT4_I(inode)->i_reserved_data_blocks += nrblocks; 1896 ei->i_reserved_data_blocks++;
1839 EXT4_I(inode)->i_reserved_meta_blocks += md_needed; 1897 ei->i_reserved_meta_blocks += md_needed;
1840 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1898 spin_unlock(&ei->i_block_reservation_lock);
1841 1899
1842 return 0; /* success */ 1900 return 0; /* success */
1843} 1901}
@@ -1845,49 +1903,46 @@ repeat:
1845static void ext4_da_release_space(struct inode *inode, int to_free) 1903static void ext4_da_release_space(struct inode *inode, int to_free)
1846{ 1904{
1847 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1905 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1848 int total, mdb, mdb_free, release; 1906 struct ext4_inode_info *ei = EXT4_I(inode);
1849 1907
1850 if (!to_free) 1908 if (!to_free)
1851 return; /* Nothing to release, exit */ 1909 return; /* Nothing to release, exit */
1852 1910
1853 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1911 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1854 1912
1855 if (!EXT4_I(inode)->i_reserved_data_blocks) { 1913 if (unlikely(to_free > ei->i_reserved_data_blocks)) {
1856 /* 1914 /*
1857 * if there is no reserved blocks, but we try to free some 1915 * if there aren't enough reserved blocks, then the
1858 * then the counter is messed up somewhere. 1916 * counter is messed up somewhere. Since this
1859 * but since this function is called from invalidate 1917 * function is called from invalidate page, it's
1860 * page, it's harmless to return without any action 1918 * harmless to return without any action.
1861 */ 1919 */
1862 printk(KERN_INFO "ext4 delalloc try to release %d reserved " 1920 ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
1863 "blocks for inode %lu, but there is no reserved " 1921 "ino %lu, to_free %d with only %d reserved "
1864 "data blocks\n", to_free, inode->i_ino); 1922 "data blocks\n", inode->i_ino, to_free,
1865 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1923 ei->i_reserved_data_blocks);
1866 return; 1924 WARN_ON(1);
1925 to_free = ei->i_reserved_data_blocks;
1867 } 1926 }
1927 ei->i_reserved_data_blocks -= to_free;
1868 1928
1869 /* recalculate the number of metablocks still need to be reserved */ 1929 if (ei->i_reserved_data_blocks == 0) {
1870 total = EXT4_I(inode)->i_reserved_data_blocks - to_free; 1930 /*
1871 mdb = ext4_calc_metadata_amount(inode, total); 1931 * We can release all of the reserved metadata blocks
1872 1932 * only when we have written all of the delayed
1873 /* figure out how many metablocks to release */ 1933 * allocation blocks.
1874 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 1934 */
1875 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; 1935 to_free += ei->i_reserved_meta_blocks;
1876 1936 ei->i_reserved_meta_blocks = 0;
1877 release = to_free + mdb_free; 1937 ei->i_da_metadata_calc_len = 0;
1878 1938 }
1879 /* update fs dirty blocks counter for truncate case */
1880 percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
1881 1939
1882 /* update per-inode reservations */ 1940 /* update fs dirty blocks counter */
1883 BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); 1941 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
1884 EXT4_I(inode)->i_reserved_data_blocks -= to_free;
1885 1942
1886 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1887 EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1888 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1943 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1889 1944
1890 vfs_dq_release_reservation_block(inode, release); 1945 dquot_release_reservation_block(inode, to_free);
1891} 1946}
1892 1947
1893static void ext4_da_page_release_reservation(struct page *page, 1948static void ext4_da_page_release_reservation(struct page *page,
@@ -2064,6 +2119,8 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2064 } else if (buffer_mapped(bh)) 2119 } else if (buffer_mapped(bh))
2065 BUG_ON(bh->b_blocknr != pblock); 2120 BUG_ON(bh->b_blocknr != pblock);
2066 2121
2122 if (buffer_uninit(exbh))
2123 set_buffer_uninit(bh);
2067 cur_logical++; 2124 cur_logical++;
2068 pblock++; 2125 pblock++;
2069 } while ((bh = bh->b_this_page) != head); 2126 } while ((bh = bh->b_this_page) != head);
@@ -2106,17 +2163,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2106 break; 2163 break;
2107 for (i = 0; i < nr_pages; i++) { 2164 for (i = 0; i < nr_pages; i++) {
2108 struct page *page = pvec.pages[i]; 2165 struct page *page = pvec.pages[i];
2109 index = page->index; 2166 if (page->index > end)
2110 if (index > end)
2111 break; 2167 break;
2112 index++;
2113
2114 BUG_ON(!PageLocked(page)); 2168 BUG_ON(!PageLocked(page));
2115 BUG_ON(PageWriteback(page)); 2169 BUG_ON(PageWriteback(page));
2116 block_invalidatepage(page, 0); 2170 block_invalidatepage(page, 0);
2117 ClearPageUptodate(page); 2171 ClearPageUptodate(page);
2118 unlock_page(page); 2172 unlock_page(page);
2119 } 2173 }
2174 index = pvec.pages[nr_pages - 1]->index + 1;
2175 pagevec_release(&pvec);
2120 } 2176 }
2121 return; 2177 return;
2122} 2178}
@@ -2192,10 +2248,12 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2192 * variables are updated after the blocks have been allocated. 2248 * variables are updated after the blocks have been allocated.
2193 */ 2249 */
2194 new.b_state = 0; 2250 new.b_state = 0;
2195 get_blocks_flags = (EXT4_GET_BLOCKS_CREATE | 2251 get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
2196 EXT4_GET_BLOCKS_DELALLOC_RESERVE); 2252 if (ext4_should_dioread_nolock(mpd->inode))
2253 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2197 if (mpd->b_state & (1 << BH_Delay)) 2254 if (mpd->b_state & (1 << BH_Delay))
2198 get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE; 2255 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2256
2199 blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, 2257 blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
2200 &new, get_blocks_flags); 2258 &new, get_blocks_flags);
2201 if (blks < 0) { 2259 if (blks < 0) {
@@ -2493,7 +2551,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2493 * XXX: __block_prepare_write() unmaps passed block, 2551 * XXX: __block_prepare_write() unmaps passed block,
2494 * is it OK? 2552 * is it OK?
2495 */ 2553 */
2496 ret = ext4_da_reserve_space(inode, 1); 2554 ret = ext4_da_reserve_space(inode, iblock);
2497 if (ret) 2555 if (ret)
2498 /* not enough space to reserve */ 2556 /* not enough space to reserve */
2499 return ret; 2557 return ret;
@@ -2603,11 +2661,14 @@ static int __ext4_journalled_writepage(struct page *page,
2603 ret = err; 2661 ret = err;
2604 2662
2605 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); 2663 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
2606 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 2664 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
2607out: 2665out:
2608 return ret; 2666 return ret;
2609} 2667}
2610 2668
2669static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
2670static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2671
2611/* 2672/*
2612 * Note that we don't need to start a transaction unless we're journaling data 2673 * Note that we don't need to start a transaction unless we're journaling data
2613 * because we should have holes filled from ext4_page_mkwrite(). We even don't 2674 * because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -2655,7 +2716,7 @@ static int ext4_writepage(struct page *page,
2655 int ret = 0; 2716 int ret = 0;
2656 loff_t size; 2717 loff_t size;
2657 unsigned int len; 2718 unsigned int len;
2658 struct buffer_head *page_bufs; 2719 struct buffer_head *page_bufs = NULL;
2659 struct inode *inode = page->mapping->host; 2720 struct inode *inode = page->mapping->host;
2660 2721
2661 trace_ext4_writepage(inode, page); 2722 trace_ext4_writepage(inode, page);
@@ -2731,7 +2792,11 @@ static int ext4_writepage(struct page *page,
2731 2792
2732 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2793 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
2733 ret = nobh_writepage(page, noalloc_get_block_write, wbc); 2794 ret = nobh_writepage(page, noalloc_get_block_write, wbc);
2734 else 2795 else if (page_bufs && buffer_uninit(page_bufs)) {
2796 ext4_set_bh_endio(page_bufs, inode);
2797 ret = block_write_full_page_endio(page, noalloc_get_block_write,
2798 wbc, ext4_end_io_buffer_write);
2799 } else
2735 ret = block_write_full_page(page, noalloc_get_block_write, 2800 ret = block_write_full_page(page, noalloc_get_block_write,
2736 wbc); 2801 wbc);
2737 2802
@@ -2967,8 +3032,7 @@ retry:
2967out_writepages: 3032out_writepages:
2968 if (!no_nrwrite_index_update) 3033 if (!no_nrwrite_index_update)
2969 wbc->no_nrwrite_index_update = 0; 3034 wbc->no_nrwrite_index_update = 0;
2970 if (wbc->nr_to_write > nr_to_writebump) 3035 wbc->nr_to_write -= nr_to_writebump;
2971 wbc->nr_to_write -= nr_to_writebump;
2972 wbc->range_start = range_start; 3036 wbc->range_start = range_start;
2973 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3037 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2974 return ret; 3038 return ret;
@@ -2993,11 +3057,18 @@ static int ext4_nonda_switch(struct super_block *sb)
2993 if (2 * free_blocks < 3 * dirty_blocks || 3057 if (2 * free_blocks < 3 * dirty_blocks ||
2994 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { 3058 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
2995 /* 3059 /*
2996 * free block count is less that 150% of dirty blocks 3060 * free block count is less than 150% of dirty blocks
2997 * or free blocks is less that watermark 3061 * or free blocks is less than watermark
2998 */ 3062 */
2999 return 1; 3063 return 1;
3000 } 3064 }
3065 /*
3066 * Even if we don't switch but are nearing capacity,
3067 * start pushing delalloc when 1/2 of free blocks are dirty.
3068 */
3069 if (free_blocks < 2 * dirty_blocks)
3070 writeback_inodes_sb_if_idle(sb);
3071
3001 return 0; 3072 return 0;
3002} 3073}
3003 3074
@@ -3005,7 +3076,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
3005 loff_t pos, unsigned len, unsigned flags, 3076 loff_t pos, unsigned len, unsigned flags,
3006 struct page **pagep, void **fsdata) 3077 struct page **pagep, void **fsdata)
3007{ 3078{
3008 int ret, retries = 0; 3079 int ret, retries = 0, quota_retries = 0;
3009 struct page *page; 3080 struct page *page;
3010 pgoff_t index; 3081 pgoff_t index;
3011 unsigned from, to; 3082 unsigned from, to;
@@ -3064,6 +3135,22 @@ retry:
3064 3135
3065 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3136 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3066 goto retry; 3137 goto retry;
3138
3139 if ((ret == -EDQUOT) &&
3140 EXT4_I(inode)->i_reserved_meta_blocks &&
3141 (quota_retries++ < 3)) {
3142 /*
3143 * Since we often over-estimate the number of meta
3144 * data blocks required, we may sometimes get a
3145 * spurios out of quota error even though there would
3146 * be enough space once we write the data blocks and
3147 * find out how many meta data blocks were _really_
3148 * required. So try forcing the inode write to see if
3149 * that helps.
3150 */
3151 write_inode_now(inode, (quota_retries == 3));
3152 goto retry;
3153 }
3067out: 3154out:
3068 return ret; 3155 return ret;
3069} 3156}
@@ -3252,7 +3339,8 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3252 filemap_write_and_wait(mapping); 3339 filemap_write_and_wait(mapping);
3253 } 3340 }
3254 3341
3255 if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 3342 if (EXT4_JOURNAL(inode) &&
3343 ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
3256 /* 3344 /*
3257 * This is a REALLY heavyweight approach, but the use of 3345 * This is a REALLY heavyweight approach, but the use of
3258 * bmap on dirty files is expected to be extremely rare: 3346 * bmap on dirty files is expected to be extremely rare:
@@ -3271,7 +3359,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3271 * everything they get. 3359 * everything they get.
3272 */ 3360 */
3273 3361
3274 EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA; 3362 ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
3275 journal = EXT4_JOURNAL(inode); 3363 journal = EXT4_JOURNAL(inode);
3276 jbd2_journal_lock_updates(journal); 3364 jbd2_journal_lock_updates(journal);
3277 err = jbd2_journal_flush(journal); 3365 err = jbd2_journal_flush(journal);
@@ -3296,11 +3384,45 @@ ext4_readpages(struct file *file, struct address_space *mapping,
3296 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 3384 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
3297} 3385}
3298 3386
3387static void ext4_free_io_end(ext4_io_end_t *io)
3388{
3389 BUG_ON(!io);
3390 if (io->page)
3391 put_page(io->page);
3392 iput(io->inode);
3393 kfree(io);
3394}
3395
3396static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
3397{
3398 struct buffer_head *head, *bh;
3399 unsigned int curr_off = 0;
3400
3401 if (!page_has_buffers(page))
3402 return;
3403 head = bh = page_buffers(page);
3404 do {
3405 if (offset <= curr_off && test_clear_buffer_uninit(bh)
3406 && bh->b_private) {
3407 ext4_free_io_end(bh->b_private);
3408 bh->b_private = NULL;
3409 bh->b_end_io = NULL;
3410 }
3411 curr_off = curr_off + bh->b_size;
3412 bh = bh->b_this_page;
3413 } while (bh != head);
3414}
3415
3299static void ext4_invalidatepage(struct page *page, unsigned long offset) 3416static void ext4_invalidatepage(struct page *page, unsigned long offset)
3300{ 3417{
3301 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3418 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3302 3419
3303 /* 3420 /*
3421 * free any io_end structure allocated for buffers to be discarded
3422 */
3423 if (ext4_should_dioread_nolock(page->mapping->host))
3424 ext4_invalidatepage_free_endio(page, offset);
3425 /*
3304 * If it's a full truncate we just forget about the pending dirtying 3426 * If it's a full truncate we just forget about the pending dirtying
3305 */ 3427 */
3306 if (offset == 0) 3428 if (offset == 0)
@@ -3371,7 +3493,14 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3371 } 3493 }
3372 3494
3373retry: 3495retry:
3374 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 3496 if (rw == READ && ext4_should_dioread_nolock(inode))
3497 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
3498 inode->i_sb->s_bdev, iov,
3499 offset, nr_segs,
3500 ext4_get_block, NULL);
3501 else
3502 ret = blockdev_direct_IO(rw, iocb, inode,
3503 inode->i_sb->s_bdev, iov,
3375 offset, nr_segs, 3504 offset, nr_segs,
3376 ext4_get_block, NULL); 3505 ext4_get_block, NULL);
3377 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3506 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3387,6 +3516,9 @@ retry:
3387 * but cannot extend i_size. Bail out and pretend 3516 * but cannot extend i_size. Bail out and pretend
3388 * the write failed... */ 3517 * the write failed... */
3389 ret = PTR_ERR(handle); 3518 ret = PTR_ERR(handle);
3519 if (inode->i_nlink)
3520 ext4_orphan_del(NULL, inode);
3521
3390 goto out; 3522 goto out;
3391 } 3523 }
3392 if (inode->i_nlink) 3524 if (inode->i_nlink)
@@ -3414,75 +3546,63 @@ out:
3414 return ret; 3546 return ret;
3415} 3547}
3416 3548
3417static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, 3549static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3418 struct buffer_head *bh_result, int create) 3550 struct buffer_head *bh_result, int create)
3419{ 3551{
3420 handle_t *handle = NULL; 3552 handle_t *handle = ext4_journal_current_handle();
3421 int ret = 0; 3553 int ret = 0;
3422 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 3554 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3423 int dio_credits; 3555 int dio_credits;
3556 int started = 0;
3424 3557
3425 ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n", 3558 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3426 inode->i_ino, create); 3559 inode->i_ino, create);
3427 /* 3560 /*
3428 * DIO VFS code passes create = 0 flag for write to 3561 * ext4_get_block in prepare for a DIO write or buffer write.
3429 * the middle of file. It does this to avoid block 3562 * We allocate an uinitialized extent if blocks haven't been allocated.
3430 * allocation for holes, to prevent expose stale data 3563 * The extent will be converted to initialized after IO complete.
3431 * out when there is parallel buffered read (which does
3432 * not hold the i_mutex lock) while direct IO write has
3433 * not completed. DIO request on holes finally falls back
3434 * to buffered IO for this reason.
3435 *
3436 * For ext4 extent based file, since we support fallocate,
3437 * new allocated extent as uninitialized, for holes, we
3438 * could fallocate blocks for holes, thus parallel
3439 * buffered IO read will zero out the page when read on
3440 * a hole while parallel DIO write to the hole has not completed.
3441 *
3442 * when we come here, we know it's a direct IO write to
3443 * to the middle of file (<i_size)
3444 * so it's safe to override the create flag from VFS.
3445 */ 3564 */
3446 create = EXT4_GET_BLOCKS_DIO_CREATE_EXT; 3565 create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
3447 3566
3448 if (max_blocks > DIO_MAX_BLOCKS) 3567 if (!handle) {
3449 max_blocks = DIO_MAX_BLOCKS; 3568 if (max_blocks > DIO_MAX_BLOCKS)
3450 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); 3569 max_blocks = DIO_MAX_BLOCKS;
3451 handle = ext4_journal_start(inode, dio_credits); 3570 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3452 if (IS_ERR(handle)) { 3571 handle = ext4_journal_start(inode, dio_credits);
3453 ret = PTR_ERR(handle); 3572 if (IS_ERR(handle)) {
3454 goto out; 3573 ret = PTR_ERR(handle);
3574 goto out;
3575 }
3576 started = 1;
3455 } 3577 }
3578
3456 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, 3579 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3457 create); 3580 create);
3458 if (ret > 0) { 3581 if (ret > 0) {
3459 bh_result->b_size = (ret << inode->i_blkbits); 3582 bh_result->b_size = (ret << inode->i_blkbits);
3460 ret = 0; 3583 ret = 0;
3461 } 3584 }
3462 ext4_journal_stop(handle); 3585 if (started)
3586 ext4_journal_stop(handle);
3463out: 3587out:
3464 return ret; 3588 return ret;
3465} 3589}
3466 3590
3467static void ext4_free_io_end(ext4_io_end_t *io) 3591static void dump_completed_IO(struct inode * inode)
3468{
3469 BUG_ON(!io);
3470 iput(io->inode);
3471 kfree(io);
3472}
3473static void dump_aio_dio_list(struct inode * inode)
3474{ 3592{
3475#ifdef EXT4_DEBUG 3593#ifdef EXT4_DEBUG
3476 struct list_head *cur, *before, *after; 3594 struct list_head *cur, *before, *after;
3477 ext4_io_end_t *io, *io0, *io1; 3595 ext4_io_end_t *io, *io0, *io1;
3596 unsigned long flags;
3478 3597
3479 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ 3598 if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
3480 ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino); 3599 ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
3481 return; 3600 return;
3482 } 3601 }
3483 3602
3484 ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino); 3603 ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
3485 list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){ 3604 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3605 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
3486 cur = &io->list; 3606 cur = &io->list;
3487 before = cur->prev; 3607 before = cur->prev;
3488 io0 = container_of(before, ext4_io_end_t, list); 3608 io0 = container_of(before, ext4_io_end_t, list);
@@ -3492,32 +3612,31 @@ static void dump_aio_dio_list(struct inode * inode)
3492 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", 3612 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3493 io, inode->i_ino, io0, io1); 3613 io, inode->i_ino, io0, io1);
3494 } 3614 }
3615 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3495#endif 3616#endif
3496} 3617}
3497 3618
3498/* 3619/*
3499 * check a range of space and convert unwritten extents to written. 3620 * check a range of space and convert unwritten extents to written.
3500 */ 3621 */
3501static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) 3622static int ext4_end_io_nolock(ext4_io_end_t *io)
3502{ 3623{
3503 struct inode *inode = io->inode; 3624 struct inode *inode = io->inode;
3504 loff_t offset = io->offset; 3625 loff_t offset = io->offset;
3505 size_t size = io->size; 3626 ssize_t size = io->size;
3506 int ret = 0; 3627 int ret = 0;
3507 3628
3508 ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p," 3629 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
3509 "list->prev 0x%p\n", 3630 "list->prev 0x%p\n",
3510 io, inode->i_ino, io->list.next, io->list.prev); 3631 io, inode->i_ino, io->list.next, io->list.prev);
3511 3632
3512 if (list_empty(&io->list)) 3633 if (list_empty(&io->list))
3513 return ret; 3634 return ret;
3514 3635
3515 if (io->flag != DIO_AIO_UNWRITTEN) 3636 if (io->flag != EXT4_IO_UNWRITTEN)
3516 return ret; 3637 return ret;
3517 3638
3518 if (offset + size <= i_size_read(inode)) 3639 ret = ext4_convert_unwritten_extents(inode, offset, size);
3519 ret = ext4_convert_unwritten_extents(inode, offset, size);
3520
3521 if (ret < 0) { 3640 if (ret < 0) {
3522 printk(KERN_EMERG "%s: failed to convert unwritten" 3641 printk(KERN_EMERG "%s: failed to convert unwritten"
3523 "extents to written extents, error is %d" 3642 "extents to written extents, error is %d"
@@ -3530,50 +3649,64 @@ static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
3530 io->flag = 0; 3649 io->flag = 0;
3531 return ret; 3650 return ret;
3532} 3651}
3652
3533/* 3653/*
3534 * work on completed aio dio IO, to convert unwritten extents to extents 3654 * work on completed aio dio IO, to convert unwritten extents to extents
3535 */ 3655 */
3536static void ext4_end_aio_dio_work(struct work_struct *work) 3656static void ext4_end_io_work(struct work_struct *work)
3537{ 3657{
3538 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); 3658 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3539 struct inode *inode = io->inode; 3659 struct inode *inode = io->inode;
3540 int ret = 0; 3660 struct ext4_inode_info *ei = EXT4_I(inode);
3661 unsigned long flags;
3662 int ret;
3541 3663
3542 mutex_lock(&inode->i_mutex); 3664 mutex_lock(&inode->i_mutex);
3543 ret = ext4_end_aio_dio_nolock(io); 3665 ret = ext4_end_io_nolock(io);
3544 if (ret >= 0) { 3666 if (ret < 0) {
3545 if (!list_empty(&io->list)) 3667 mutex_unlock(&inode->i_mutex);
3546 list_del_init(&io->list); 3668 return;
3547 ext4_free_io_end(io);
3548 } 3669 }
3670
3671 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3672 if (!list_empty(&io->list))
3673 list_del_init(&io->list);
3674 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3549 mutex_unlock(&inode->i_mutex); 3675 mutex_unlock(&inode->i_mutex);
3676 ext4_free_io_end(io);
3550} 3677}
3678
3551/* 3679/*
3552 * This function is called from ext4_sync_file(). 3680 * This function is called from ext4_sync_file().
3553 * 3681 *
3554 * When AIO DIO IO is completed, the work to convert unwritten 3682 * When IO is completed, the work to convert unwritten extents to
3555 * extents to written is queued on workqueue but may not get immediately 3683 * written is queued on workqueue but may not get immediately
3556 * scheduled. When fsync is called, we need to ensure the 3684 * scheduled. When fsync is called, we need to ensure the
3557 * conversion is complete before fsync returns. 3685 * conversion is complete before fsync returns.
3558 * The inode keeps track of a list of completed AIO from DIO path 3686 * The inode keeps track of a list of pending/completed IO that
3559 * that might needs to do the conversion. This function walks through 3687 * might needs to do the conversion. This function walks through
3560 * the list and convert the related unwritten extents to written. 3688 * the list and convert the related unwritten extents for completed IO
3689 * to written.
3690 * The function return the number of pending IOs on success.
3561 */ 3691 */
3562int flush_aio_dio_completed_IO(struct inode *inode) 3692int flush_completed_IO(struct inode *inode)
3563{ 3693{
3564 ext4_io_end_t *io; 3694 ext4_io_end_t *io;
3695 struct ext4_inode_info *ei = EXT4_I(inode);
3696 unsigned long flags;
3565 int ret = 0; 3697 int ret = 0;
3566 int ret2 = 0; 3698 int ret2 = 0;
3567 3699
3568 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) 3700 if (list_empty(&ei->i_completed_io_list))
3569 return ret; 3701 return ret;
3570 3702
3571 dump_aio_dio_list(inode); 3703 dump_completed_IO(inode);
3572 while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ 3704 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3573 io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next, 3705 while (!list_empty(&ei->i_completed_io_list)){
3706 io = list_entry(ei->i_completed_io_list.next,
3574 ext4_io_end_t, list); 3707 ext4_io_end_t, list);
3575 /* 3708 /*
3576 * Calling ext4_end_aio_dio_nolock() to convert completed 3709 * Calling ext4_end_io_nolock() to convert completed
3577 * IO to written. 3710 * IO to written.
3578 * 3711 *
3579 * When ext4_sync_file() is called, run_queue() may already 3712 * When ext4_sync_file() is called, run_queue() may already
@@ -3586,20 +3719,23 @@ int flush_aio_dio_completed_IO(struct inode *inode)
3586 * avoid double converting from both fsync and background work 3719 * avoid double converting from both fsync and background work
3587 * queue work. 3720 * queue work.
3588 */ 3721 */
3589 ret = ext4_end_aio_dio_nolock(io); 3722 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3723 ret = ext4_end_io_nolock(io);
3724 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3590 if (ret < 0) 3725 if (ret < 0)
3591 ret2 = ret; 3726 ret2 = ret;
3592 else 3727 else
3593 list_del_init(&io->list); 3728 list_del_init(&io->list);
3594 } 3729 }
3730 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3595 return (ret2 < 0) ? ret2 : 0; 3731 return (ret2 < 0) ? ret2 : 0;
3596} 3732}
3597 3733
3598static ext4_io_end_t *ext4_init_io_end (struct inode *inode) 3734static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
3599{ 3735{
3600 ext4_io_end_t *io = NULL; 3736 ext4_io_end_t *io = NULL;
3601 3737
3602 io = kmalloc(sizeof(*io), GFP_NOFS); 3738 io = kmalloc(sizeof(*io), flags);
3603 3739
3604 if (io) { 3740 if (io) {
3605 igrab(inode); 3741 igrab(inode);
@@ -3607,8 +3743,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
3607 io->flag = 0; 3743 io->flag = 0;
3608 io->offset = 0; 3744 io->offset = 0;
3609 io->size = 0; 3745 io->size = 0;
3610 io->error = 0; 3746 io->page = NULL;
3611 INIT_WORK(&io->work, ext4_end_aio_dio_work); 3747 INIT_WORK(&io->work, ext4_end_io_work);
3612 INIT_LIST_HEAD(&io->list); 3748 INIT_LIST_HEAD(&io->list);
3613 } 3749 }
3614 3750
@@ -3620,6 +3756,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3620{ 3756{
3621 ext4_io_end_t *io_end = iocb->private; 3757 ext4_io_end_t *io_end = iocb->private;
3622 struct workqueue_struct *wq; 3758 struct workqueue_struct *wq;
3759 unsigned long flags;
3760 struct ext4_inode_info *ei;
3623 3761
3624 /* if not async direct IO or dio with 0 bytes write, just return */ 3762 /* if not async direct IO or dio with 0 bytes write, just return */
3625 if (!io_end || !size) 3763 if (!io_end || !size)
@@ -3631,7 +3769,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3631 size); 3769 size);
3632 3770
3633 /* if not aio dio with unwritten extents, just free io and return */ 3771 /* if not aio dio with unwritten extents, just free io and return */
3634 if (io_end->flag != DIO_AIO_UNWRITTEN){ 3772 if (io_end->flag != EXT4_IO_UNWRITTEN){
3635 ext4_free_io_end(io_end); 3773 ext4_free_io_end(io_end);
3636 iocb->private = NULL; 3774 iocb->private = NULL;
3637 return; 3775 return;
@@ -3639,16 +3777,85 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3639 3777
3640 io_end->offset = offset; 3778 io_end->offset = offset;
3641 io_end->size = size; 3779 io_end->size = size;
3780 io_end->flag = EXT4_IO_UNWRITTEN;
3642 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3781 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3643 3782
3644 /* queue the work to convert unwritten extents to written */ 3783 /* queue the work to convert unwritten extents to written */
3645 queue_work(wq, &io_end->work); 3784 queue_work(wq, &io_end->work);
3646 3785
3647 /* Add the io_end to per-inode completed aio dio list*/ 3786 /* Add the io_end to per-inode completed aio dio list*/
3648 list_add_tail(&io_end->list, 3787 ei = EXT4_I(io_end->inode);
3649 &EXT4_I(io_end->inode)->i_aio_dio_complete_list); 3788 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
3789 list_add_tail(&io_end->list, &ei->i_completed_io_list);
3790 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3650 iocb->private = NULL; 3791 iocb->private = NULL;
3651} 3792}
3793
3794static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
3795{
3796 ext4_io_end_t *io_end = bh->b_private;
3797 struct workqueue_struct *wq;
3798 struct inode *inode;
3799 unsigned long flags;
3800
3801 if (!test_clear_buffer_uninit(bh) || !io_end)
3802 goto out;
3803
3804 if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
3805 printk("sb umounted, discard end_io request for inode %lu\n",
3806 io_end->inode->i_ino);
3807 ext4_free_io_end(io_end);
3808 goto out;
3809 }
3810
3811 io_end->flag = EXT4_IO_UNWRITTEN;
3812 inode = io_end->inode;
3813
3814 /* Add the io_end to per-inode completed io list*/
3815 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
3816 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
3817 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
3818
3819 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
3820 /* queue the work to convert unwritten extents to written */
3821 queue_work(wq, &io_end->work);
3822out:
3823 bh->b_private = NULL;
3824 bh->b_end_io = NULL;
3825 clear_buffer_uninit(bh);
3826 end_buffer_async_write(bh, uptodate);
3827}
3828
3829static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
3830{
3831 ext4_io_end_t *io_end;
3832 struct page *page = bh->b_page;
3833 loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
3834 size_t size = bh->b_size;
3835
3836retry:
3837 io_end = ext4_init_io_end(inode, GFP_ATOMIC);
3838 if (!io_end) {
3839 if (printk_ratelimit())
3840 printk(KERN_WARNING "%s: allocation fail\n", __func__);
3841 schedule();
3842 goto retry;
3843 }
3844 io_end->offset = offset;
3845 io_end->size = size;
3846 /*
3847 * We need to hold a reference to the page to make sure it
3848 * doesn't get evicted before ext4_end_io_work() has a chance
3849 * to convert the extent from written to unwritten.
3850 */
3851 io_end->page = page;
3852 get_page(io_end->page);
3853
3854 bh->b_private = io_end;
3855 bh->b_end_io = ext4_end_io_buffer_write;
3856 return 0;
3857}
3858
3652/* 3859/*
3653 * For ext4 extent files, ext4 will do direct-io write to holes, 3860 * For ext4 extent files, ext4 will do direct-io write to holes,
3654 * preallocated extents, and those write extend the file, no need to 3861 * preallocated extents, and those write extend the file, no need to
@@ -3702,7 +3909,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3702 iocb->private = NULL; 3909 iocb->private = NULL;
3703 EXT4_I(inode)->cur_aio_dio = NULL; 3910 EXT4_I(inode)->cur_aio_dio = NULL;
3704 if (!is_sync_kiocb(iocb)) { 3911 if (!is_sync_kiocb(iocb)) {
3705 iocb->private = ext4_init_io_end(inode); 3912 iocb->private = ext4_init_io_end(inode, GFP_NOFS);
3706 if (!iocb->private) 3913 if (!iocb->private)
3707 return -ENOMEM; 3914 return -ENOMEM;
3708 /* 3915 /*
@@ -3718,7 +3925,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3718 ret = blockdev_direct_IO(rw, iocb, inode, 3925 ret = blockdev_direct_IO(rw, iocb, inode,
3719 inode->i_sb->s_bdev, iov, 3926 inode->i_sb->s_bdev, iov,
3720 offset, nr_segs, 3927 offset, nr_segs,
3721 ext4_get_block_dio_write, 3928 ext4_get_block_write,
3722 ext4_end_io_dio); 3929 ext4_end_io_dio);
3723 if (iocb->private) 3930 if (iocb->private)
3724 EXT4_I(inode)->cur_aio_dio = NULL; 3931 EXT4_I(inode)->cur_aio_dio = NULL;
@@ -3739,8 +3946,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3739 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3946 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3740 ext4_free_io_end(iocb->private); 3947 ext4_free_io_end(iocb->private);
3741 iocb->private = NULL; 3948 iocb->private = NULL;
3742 } else if (ret > 0 && (EXT4_I(inode)->i_state & 3949 } else if (ret > 0 && ext4_test_inode_state(inode,
3743 EXT4_STATE_DIO_UNWRITTEN)) { 3950 EXT4_STATE_DIO_UNWRITTEN)) {
3744 int err; 3951 int err;
3745 /* 3952 /*
3746 * for non AIO case, since the IO is already 3953 * for non AIO case, since the IO is already
@@ -3750,7 +3957,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3750 offset, ret); 3957 offset, ret);
3751 if (err < 0) 3958 if (err < 0)
3752 ret = err; 3959 ret = err;
3753 EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN; 3960 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3754 } 3961 }
3755 return ret; 3962 return ret;
3756 } 3963 }
@@ -4081,18 +4288,27 @@ no_top:
4081 * We release `count' blocks on disk, but (last - first) may be greater 4288 * We release `count' blocks on disk, but (last - first) may be greater
4082 * than `count' because there can be holes in there. 4289 * than `count' because there can be holes in there.
4083 */ 4290 */
4084static void ext4_clear_blocks(handle_t *handle, struct inode *inode, 4291static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4085 struct buffer_head *bh, 4292 struct buffer_head *bh,
4086 ext4_fsblk_t block_to_free, 4293 ext4_fsblk_t block_to_free,
4087 unsigned long count, __le32 *first, 4294 unsigned long count, __le32 *first,
4088 __le32 *last) 4295 __le32 *last)
4089{ 4296{
4090 __le32 *p; 4297 __le32 *p;
4091 int flags = EXT4_FREE_BLOCKS_FORGET; 4298 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
4092 4299
4093 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 4300 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4094 flags |= EXT4_FREE_BLOCKS_METADATA; 4301 flags |= EXT4_FREE_BLOCKS_METADATA;
4095 4302
4303 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4304 count)) {
4305 ext4_error(inode->i_sb, "inode #%lu: "
4306 "attempt to clear blocks %llu len %lu, invalid",
4307 inode->i_ino, (unsigned long long) block_to_free,
4308 count);
4309 return 1;
4310 }
4311
4096 if (try_to_extend_transaction(handle, inode)) { 4312 if (try_to_extend_transaction(handle, inode)) {
4097 if (bh) { 4313 if (bh) {
4098 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4314 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
@@ -4111,6 +4327,7 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
4111 *p = 0; 4327 *p = 0;
4112 4328
4113 ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); 4329 ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
4330 return 0;
4114} 4331}
4115 4332
4116/** 4333/**
@@ -4166,9 +4383,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4166 } else if (nr == block_to_free + count) { 4383 } else if (nr == block_to_free + count) {
4167 count++; 4384 count++;
4168 } else { 4385 } else {
4169 ext4_clear_blocks(handle, inode, this_bh, 4386 if (ext4_clear_blocks(handle, inode, this_bh,
4170 block_to_free, 4387 block_to_free, count,
4171 count, block_to_free_p, p); 4388 block_to_free_p, p))
4389 break;
4172 block_to_free = nr; 4390 block_to_free = nr;
4173 block_to_free_p = p; 4391 block_to_free_p = p;
4174 count = 1; 4392 count = 1;
@@ -4192,7 +4410,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4192 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 4410 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4193 ext4_handle_dirty_metadata(handle, inode, this_bh); 4411 ext4_handle_dirty_metadata(handle, inode, this_bh);
4194 else 4412 else
4195 ext4_error(inode->i_sb, __func__, 4413 ext4_error(inode->i_sb,
4196 "circular indirect block detected, " 4414 "circular indirect block detected, "
4197 "inode=%lu, block=%llu", 4415 "inode=%lu, block=%llu",
4198 inode->i_ino, 4416 inode->i_ino,
@@ -4232,6 +4450,16 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4232 if (!nr) 4450 if (!nr)
4233 continue; /* A hole */ 4451 continue; /* A hole */
4234 4452
4453 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4454 nr, 1)) {
4455 ext4_error(inode->i_sb,
4456 "indirect mapped block in inode "
4457 "#%lu invalid (level %d, blk #%lu)",
4458 inode->i_ino, depth,
4459 (unsigned long) nr);
4460 break;
4461 }
4462
4235 /* Go read the buffer for the next level down */ 4463 /* Go read the buffer for the next level down */
4236 bh = sb_bread(inode->i_sb, nr); 4464 bh = sb_bread(inode->i_sb, nr);
4237 4465
@@ -4240,7 +4468,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4240 * (should be rare). 4468 * (should be rare).
4241 */ 4469 */
4242 if (!bh) { 4470 if (!bh) {
4243 ext4_error(inode->i_sb, "ext4_free_branches", 4471 ext4_error(inode->i_sb,
4244 "Read failure, inode=%lu, block=%llu", 4472 "Read failure, inode=%lu, block=%llu",
4245 inode->i_ino, nr); 4473 inode->i_ino, nr);
4246 continue; 4474 continue;
@@ -4384,8 +4612,10 @@ void ext4_truncate(struct inode *inode)
4384 if (!ext4_can_truncate(inode)) 4612 if (!ext4_can_truncate(inode))
4385 return; 4613 return;
4386 4614
4615 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
4616
4387 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4617 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4388 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; 4618 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4389 4619
4390 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 4620 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
4391 ext4_ext_truncate(inode); 4621 ext4_ext_truncate(inode);
@@ -4555,9 +4785,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
4555 4785
4556 bh = sb_getblk(sb, block); 4786 bh = sb_getblk(sb, block);
4557 if (!bh) { 4787 if (!bh) {
4558 ext4_error(sb, "ext4_get_inode_loc", "unable to read " 4788 ext4_error(sb, "unable to read inode block - "
4559 "inode block - inode=%lu, block=%llu", 4789 "inode=%lu, block=%llu", inode->i_ino, block);
4560 inode->i_ino, block);
4561 return -EIO; 4790 return -EIO;
4562 } 4791 }
4563 if (!buffer_uptodate(bh)) { 4792 if (!buffer_uptodate(bh)) {
@@ -4655,9 +4884,8 @@ make_io:
4655 submit_bh(READ_META, bh); 4884 submit_bh(READ_META, bh);
4656 wait_on_buffer(bh); 4885 wait_on_buffer(bh);
4657 if (!buffer_uptodate(bh)) { 4886 if (!buffer_uptodate(bh)) {
4658 ext4_error(sb, __func__, 4887 ext4_error(sb, "unable to read inode block - inode=%lu,"
4659 "unable to read inode block - inode=%lu, " 4888 " block=%llu", inode->i_ino, block);
4660 "block=%llu", inode->i_ino, block);
4661 brelse(bh); 4889 brelse(bh);
4662 return -EIO; 4890 return -EIO;
4663 } 4891 }
@@ -4671,7 +4899,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
4671{ 4899{
4672 /* We have all inode data except xattrs in memory here. */ 4900 /* We have all inode data except xattrs in memory here. */
4673 return __ext4_get_inode_loc(inode, iloc, 4901 return __ext4_get_inode_loc(inode, iloc,
4674 !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)); 4902 !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
4675} 4903}
4676 4904
4677void ext4_set_inode_flags(struct inode *inode) 4905void ext4_set_inode_flags(struct inode *inode)
@@ -4765,7 +4993,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4765 } 4993 }
4766 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4994 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
4767 4995
4768 ei->i_state = 0; 4996 ei->i_state_flags = 0;
4769 ei->i_dir_start_lookup = 0; 4997 ei->i_dir_start_lookup = 0;
4770 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4998 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
4771 /* We now have enough fields to check if the inode was active or not. 4999 /* We now have enough fields to check if the inode was active or not.
@@ -4848,7 +5076,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4848 EXT4_GOOD_OLD_INODE_SIZE + 5076 EXT4_GOOD_OLD_INODE_SIZE +
4849 ei->i_extra_isize; 5077 ei->i_extra_isize;
4850 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) 5078 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
4851 ei->i_state |= EXT4_STATE_XATTR; 5079 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
4852 } 5080 }
4853 } else 5081 } else
4854 ei->i_extra_isize = 0; 5082 ei->i_extra_isize = 0;
@@ -4868,8 +5096,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4868 ret = 0; 5096 ret = 0;
4869 if (ei->i_file_acl && 5097 if (ei->i_file_acl &&
4870 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { 5098 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
4871 ext4_error(sb, __func__, 5099 ext4_error(sb, "bad extended attribute block %llu inode #%lu",
4872 "bad extended attribute block %llu in inode #%lu",
4873 ei->i_file_acl, inode->i_ino); 5100 ei->i_file_acl, inode->i_ino);
4874 ret = -EIO; 5101 ret = -EIO;
4875 goto bad_inode; 5102 goto bad_inode;
@@ -4915,8 +5142,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4915 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 5142 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
4916 } else { 5143 } else {
4917 ret = -EIO; 5144 ret = -EIO;
4918 ext4_error(inode->i_sb, __func__, 5145 ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu",
4919 "bogus i_mode (%o) for inode=%lu",
4920 inode->i_mode, inode->i_ino); 5146 inode->i_mode, inode->i_ino);
4921 goto bad_inode; 5147 goto bad_inode;
4922 } 5148 }
@@ -4988,7 +5214,7 @@ static int ext4_do_update_inode(handle_t *handle,
4988 5214
4989 /* For fields not not tracking in the in-memory inode, 5215 /* For fields not not tracking in the in-memory inode,
4990 * initialise them to zero for new inodes. */ 5216 * initialise them to zero for new inodes. */
4991 if (ei->i_state & EXT4_STATE_NEW) 5217 if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
4992 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 5218 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
4993 5219
4994 ext4_get_inode_flags(ei); 5220 ext4_get_inode_flags(ei);
@@ -5052,7 +5278,7 @@ static int ext4_do_update_inode(handle_t *handle,
5052 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 5278 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
5053 sb->s_dirt = 1; 5279 sb->s_dirt = 1;
5054 ext4_handle_sync(handle); 5280 ext4_handle_sync(handle);
5055 err = ext4_handle_dirty_metadata(handle, inode, 5281 err = ext4_handle_dirty_metadata(handle, NULL,
5056 EXT4_SB(sb)->s_sbh); 5282 EXT4_SB(sb)->s_sbh);
5057 } 5283 }
5058 } 5284 }
@@ -5081,10 +5307,10 @@ static int ext4_do_update_inode(handle_t *handle,
5081 } 5307 }
5082 5308
5083 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 5309 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
5084 rc = ext4_handle_dirty_metadata(handle, inode, bh); 5310 rc = ext4_handle_dirty_metadata(handle, NULL, bh);
5085 if (!err) 5311 if (!err)
5086 err = rc; 5312 err = rc;
5087 ei->i_state &= ~EXT4_STATE_NEW; 5313 ext4_clear_inode_state(inode, EXT4_STATE_NEW);
5088 5314
5089 ext4_update_inode_fsync_trans(handle, inode, 0); 5315 ext4_update_inode_fsync_trans(handle, inode, 0);
5090out_brelse: 5316out_brelse:
@@ -5128,7 +5354,7 @@ out_brelse:
5128 * `stuff()' is running, and the new i_size will be lost. Plus the inode 5354 * `stuff()' is running, and the new i_size will be lost. Plus the inode
5129 * will no longer be on the superblock's dirty inode list. 5355 * will no longer be on the superblock's dirty inode list.
5130 */ 5356 */
5131int ext4_write_inode(struct inode *inode, int wait) 5357int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5132{ 5358{
5133 int err; 5359 int err;
5134 5360
@@ -5142,26 +5368,25 @@ int ext4_write_inode(struct inode *inode, int wait)
5142 return -EIO; 5368 return -EIO;
5143 } 5369 }
5144 5370
5145 if (!wait) 5371 if (wbc->sync_mode != WB_SYNC_ALL)
5146 return 0; 5372 return 0;
5147 5373
5148 err = ext4_force_commit(inode->i_sb); 5374 err = ext4_force_commit(inode->i_sb);
5149 } else { 5375 } else {
5150 struct ext4_iloc iloc; 5376 struct ext4_iloc iloc;
5151 5377
5152 err = ext4_get_inode_loc(inode, &iloc); 5378 err = __ext4_get_inode_loc(inode, &iloc, 0);
5153 if (err) 5379 if (err)
5154 return err; 5380 return err;
5155 if (wait) 5381 if (wbc->sync_mode == WB_SYNC_ALL)
5156 sync_dirty_buffer(iloc.bh); 5382 sync_dirty_buffer(iloc.bh);
5157 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5383 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5158 ext4_error(inode->i_sb, __func__, 5384 ext4_error(inode->i_sb, "IO error syncing inode, "
5159 "IO error syncing inode, " 5385 "inode=%lu, block=%llu", inode->i_ino,
5160 "inode=%lu, block=%llu",
5161 inode->i_ino,
5162 (unsigned long long)iloc.bh->b_blocknr); 5386 (unsigned long long)iloc.bh->b_blocknr);
5163 err = -EIO; 5387 err = -EIO;
5164 } 5388 }
5389 brelse(iloc.bh);
5165 } 5390 }
5166 return err; 5391 return err;
5167} 5392}
@@ -5200,6 +5425,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5200 if (error) 5425 if (error)
5201 return error; 5426 return error;
5202 5427
5428 if (ia_valid & ATTR_SIZE)
5429 dquot_initialize(inode);
5203 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 5430 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
5204 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 5431 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
5205 handle_t *handle; 5432 handle_t *handle;
@@ -5212,7 +5439,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5212 error = PTR_ERR(handle); 5439 error = PTR_ERR(handle);
5213 goto err_out; 5440 goto err_out;
5214 } 5441 }
5215 error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; 5442 error = dquot_transfer(inode, attr);
5216 if (error) { 5443 if (error) {
5217 ext4_journal_stop(handle); 5444 ext4_journal_stop(handle);
5218 return error; 5445 return error;
@@ -5239,7 +5466,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5239 } 5466 }
5240 5467
5241 if (S_ISREG(inode->i_mode) && 5468 if (S_ISREG(inode->i_mode) &&
5242 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { 5469 attr->ia_valid & ATTR_SIZE &&
5470 (attr->ia_size < inode->i_size ||
5471 (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) {
5243 handle_t *handle; 5472 handle_t *handle;
5244 5473
5245 handle = ext4_journal_start(inode, 3); 5474 handle = ext4_journal_start(inode, 3);
@@ -5270,6 +5499,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5270 goto err_out; 5499 goto err_out;
5271 } 5500 }
5272 } 5501 }
5502 /* ext4_truncate will clear the flag */
5503 if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))
5504 ext4_truncate(inode);
5273 } 5505 }
5274 5506
5275 rc = inode_setattr(inode, attr); 5507 rc = inode_setattr(inode, attr);
@@ -5508,8 +5740,8 @@ static int ext4_expand_extra_isize(struct inode *inode,
5508 entry = IFIRST(header); 5740 entry = IFIRST(header);
5509 5741
5510 /* No extended attributes present */ 5742 /* No extended attributes present */
5511 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) || 5743 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
5512 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { 5744 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
5513 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, 5745 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
5514 new_extra_isize); 5746 new_extra_isize);
5515 EXT4_I(inode)->i_extra_isize = new_extra_isize; 5747 EXT4_I(inode)->i_extra_isize = new_extra_isize;
@@ -5553,7 +5785,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5553 err = ext4_reserve_inode_write(handle, inode, &iloc); 5785 err = ext4_reserve_inode_write(handle, inode, &iloc);
5554 if (ext4_handle_valid(handle) && 5786 if (ext4_handle_valid(handle) &&
5555 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5787 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
5556 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { 5788 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
5557 /* 5789 /*
5558 * We need extra buffer credits since we may write into EA block 5790 * We need extra buffer credits since we may write into EA block
5559 * with this same handle. If journal_extend fails, then it will 5791 * with this same handle. If journal_extend fails, then it will
@@ -5567,10 +5799,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5567 sbi->s_want_extra_isize, 5799 sbi->s_want_extra_isize,
5568 iloc, handle); 5800 iloc, handle);
5569 if (ret) { 5801 if (ret) {
5570 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; 5802 ext4_set_inode_state(inode,
5803 EXT4_STATE_NO_EXPAND);
5571 if (mnt_count != 5804 if (mnt_count !=
5572 le16_to_cpu(sbi->s_es->s_mnt_count)) { 5805 le16_to_cpu(sbi->s_es->s_mnt_count)) {
5573 ext4_warning(inode->i_sb, __func__, 5806 ext4_warning(inode->i_sb,
5574 "Unable to expand inode %lu. Delete" 5807 "Unable to expand inode %lu. Delete"
5575 " some EAs or run e2fsck.", 5808 " some EAs or run e2fsck.",
5576 inode->i_ino); 5809 inode->i_ino);
@@ -5592,7 +5825,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5592 * i_size has been changed by generic_commit_write() and we thus need 5825 * i_size has been changed by generic_commit_write() and we thus need
5593 * to include the updated inode in the current transaction. 5826 * to include the updated inode in the current transaction.
5594 * 5827 *
5595 * Also, vfs_dq_alloc_block() will always dirty the inode when blocks 5828 * Also, dquot_alloc_block() will always dirty the inode when blocks
5596 * are allocated to the file. 5829 * are allocated to the file.
5597 * 5830 *
5598 * If the inode is marked synchronous, we don't honour that here - doing 5831 * If the inode is marked synchronous, we don't honour that here - doing
@@ -5634,7 +5867,7 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
5634 err = jbd2_journal_get_write_access(handle, iloc.bh); 5867 err = jbd2_journal_get_write_access(handle, iloc.bh);
5635 if (!err) 5868 if (!err)
5636 err = ext4_handle_dirty_metadata(handle, 5869 err = ext4_handle_dirty_metadata(handle,
5637 inode, 5870 NULL,
5638 iloc.bh); 5871 iloc.bh);
5639 brelse(iloc.bh); 5872 brelse(iloc.bh);
5640 } 5873 }
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index b63d193126db..016d0249294f 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -92,6 +92,15 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
92 flags &= ~EXT4_EXTENTS_FL; 92 flags &= ~EXT4_EXTENTS_FL;
93 } 93 }
94 94
95 if (flags & EXT4_EOFBLOCKS_FL) {
96 /* we don't support adding EOFBLOCKS flag */
97 if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
98 err = -EOPNOTSUPP;
99 goto flags_out;
100 }
101 } else if (oldflags & EXT4_EOFBLOCKS_FL)
102 ext4_truncate(inode);
103
95 handle = ext4_journal_start(inode, 1); 104 handle = ext4_journal_start(inode, 1);
96 if (IS_ERR(handle)) { 105 if (IS_ERR(handle)) {
97 err = PTR_ERR(handle); 106 err = PTR_ERR(handle);
@@ -249,7 +258,8 @@ setversion_out:
249 if (me.moved_len > 0) 258 if (me.moved_len > 0)
250 file_remove_suid(donor_filp); 259 file_remove_suid(donor_filp);
251 260
252 if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) 261 if (copy_to_user((struct move_extent __user *)arg,
262 &me, sizeof(me)))
253 err = -EFAULT; 263 err = -EFAULT;
254mext_out: 264mext_out:
255 fput(donor_filp); 265 fput(donor_filp);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d34afad3e137..b423a364dca3 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -23,6 +23,7 @@
23 23
24#include "mballoc.h" 24#include "mballoc.h"
25#include <linux/debugfs.h> 25#include <linux/debugfs.h>
26#include <linux/slab.h>
26#include <trace/events/ext4.h> 27#include <trace/events/ext4.h>
27 28
28/* 29/*
@@ -69,7 +70,7 @@
69 * 70 *
70 * pa_lstart -> the logical start block for this prealloc space 71 * pa_lstart -> the logical start block for this prealloc space
71 * pa_pstart -> the physical start block for this prealloc space 72 * pa_pstart -> the physical start block for this prealloc space
72 * pa_len -> lenght for this prealloc space 73 * pa_len -> length for this prealloc space
73 * pa_free -> free space available in this prealloc space 74 * pa_free -> free space available in this prealloc space
74 * 75 *
75 * The inode preallocation space is used looking at the _logical_ start 76 * The inode preallocation space is used looking at the _logical_ start
@@ -441,10 +442,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
441 for (i = 0; i < count; i++) { 442 for (i = 0; i < count; i++) {
442 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { 443 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
443 ext4_fsblk_t blocknr; 444 ext4_fsblk_t blocknr;
444 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); 445
446 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
445 blocknr += first + i; 447 blocknr += first + i;
446 blocknr +=
447 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
448 ext4_grp_locked_error(sb, e4b->bd_group, 448 ext4_grp_locked_error(sb, e4b->bd_group,
449 __func__, "double-free of inode" 449 __func__, "double-free of inode"
450 " %lu's block %llu(bit %u in group %u)", 450 " %lu's block %llu(bit %u in group %u)",
@@ -1255,10 +1255,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1255 1255
1256 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { 1256 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
1257 ext4_fsblk_t blocknr; 1257 ext4_fsblk_t blocknr;
1258 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); 1258
1259 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1259 blocknr += block; 1260 blocknr += block;
1260 blocknr +=
1261 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
1262 ext4_grp_locked_error(sb, e4b->bd_group, 1261 ext4_grp_locked_error(sb, e4b->bd_group,
1263 __func__, "double-free of inode" 1262 __func__, "double-free of inode"
1264 " %lu's block %llu(bit %u in group %u)", 1263 " %lu's block %llu(bit %u in group %u)",
@@ -1631,7 +1630,6 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1631 int max; 1630 int max;
1632 int err; 1631 int err;
1633 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1632 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1634 struct ext4_super_block *es = sbi->s_es;
1635 struct ext4_free_extent ex; 1633 struct ext4_free_extent ex;
1636 1634
1637 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) 1635 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
@@ -1648,8 +1646,8 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1648 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { 1646 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1649 ext4_fsblk_t start; 1647 ext4_fsblk_t start;
1650 1648
1651 start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) + 1649 start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
1652 ex.fe_start + le32_to_cpu(es->s_first_data_block); 1650 ex.fe_start;
1653 /* use do_div to get remainder (would be 64-bit modulo) */ 1651 /* use do_div to get remainder (would be 64-bit modulo) */
1654 if (do_div(start, sbi->s_stripe) == 0) { 1652 if (do_div(start, sbi->s_stripe) == 0) {
1655 ac->ac_found++; 1653 ac->ac_found++;
@@ -1803,8 +1801,8 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1803 BUG_ON(sbi->s_stripe == 0); 1801 BUG_ON(sbi->s_stripe == 0);
1804 1802
1805 /* find first stripe-aligned block in group */ 1803 /* find first stripe-aligned block in group */
1806 first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb) 1804 first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
1807 + le32_to_cpu(sbi->s_es->s_first_data_block); 1805
1808 a = first_group_block + sbi->s_stripe - 1; 1806 a = first_group_block + sbi->s_stripe - 1;
1809 do_div(a, sbi->s_stripe); 1807 do_div(a, sbi->s_stripe);
1810 i = (a * sbi->s_stripe) - first_group_block; 1808 i = (a * sbi->s_stripe) - first_group_block;
@@ -2256,7 +2254,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2256 2254
2257 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2255 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2258 init_rwsem(&meta_group_info[i]->alloc_sem); 2256 init_rwsem(&meta_group_info[i]->alloc_sem);
2259 meta_group_info[i]->bb_free_root.rb_node = NULL; 2257 meta_group_info[i]->bb_free_root = RB_ROOT;
2260 2258
2261#ifdef DOUBLE_CHECK 2259#ifdef DOUBLE_CHECK
2262 { 2260 {
@@ -2537,6 +2535,17 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2537 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2535 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2538 entry->count, entry->group, entry); 2536 entry->count, entry->group, entry);
2539 2537
2538 if (test_opt(sb, DISCARD)) {
2539 ext4_fsblk_t discard_block;
2540
2541 discard_block = entry->start_blk +
2542 ext4_group_first_block_no(sb, entry->group);
2543 trace_ext4_discard_blocks(sb,
2544 (unsigned long long)discard_block,
2545 entry->count);
2546 sb_issue_discard(sb, discard_block, entry->count);
2547 }
2548
2540 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2549 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2541 /* we expect to find existing buddy because it's pinned */ 2550 /* we expect to find existing buddy because it's pinned */
2542 BUG_ON(err != 0); 2551 BUG_ON(err != 0);
@@ -2558,19 +2567,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2558 page_cache_release(e4b.bd_bitmap_page); 2567 page_cache_release(e4b.bd_bitmap_page);
2559 } 2568 }
2560 ext4_unlock_group(sb, entry->group); 2569 ext4_unlock_group(sb, entry->group);
2561 if (test_opt(sb, DISCARD)) {
2562 ext4_fsblk_t discard_block;
2563 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
2564
2565 discard_block = (ext4_fsblk_t)entry->group *
2566 EXT4_BLOCKS_PER_GROUP(sb)
2567 + entry->start_blk
2568 + le32_to_cpu(es->s_first_data_block);
2569 trace_ext4_discard_blocks(sb,
2570 (unsigned long long)discard_block,
2571 entry->count);
2572 sb_issue_discard(sb, discard_block, entry->count);
2573 }
2574 kmem_cache_free(ext4_free_ext_cachep, entry); 2570 kmem_cache_free(ext4_free_ext_cachep, entry);
2575 ext4_mb_release_desc(&e4b); 2571 ext4_mb_release_desc(&e4b);
2576 } 2572 }
@@ -2703,14 +2699,11 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2703 if (err) 2699 if (err)
2704 goto out_err; 2700 goto out_err;
2705 2701
2706 block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb) 2702 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
2707 + ac->ac_b_ex.fe_start
2708 + le32_to_cpu(es->s_first_data_block);
2709 2703
2710 len = ac->ac_b_ex.fe_len; 2704 len = ac->ac_b_ex.fe_len;
2711 if (!ext4_data_block_valid(sbi, block, len)) { 2705 if (!ext4_data_block_valid(sbi, block, len)) {
2712 ext4_error(sb, __func__, 2706 ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
2713 "Allocating blocks %llu-%llu which overlap "
2714 "fs metadata\n", block, block+len); 2707 "fs metadata\n", block, block+len);
2715 /* File system mounted not to panic on error 2708 /* File system mounted not to panic on error
2716 * Fix the bitmap and repeat the block allocation 2709 * Fix the bitmap and repeat the block allocation
@@ -3161,9 +3154,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3161 /* The max size of hash table is PREALLOC_TB_SIZE */ 3154 /* The max size of hash table is PREALLOC_TB_SIZE */
3162 order = PREALLOC_TB_SIZE - 1; 3155 order = PREALLOC_TB_SIZE - 1;
3163 3156
3164 goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) + 3157 goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
3165 ac->ac_g_ex.fe_start +
3166 le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block);
3167 /* 3158 /*
3168 * search for the prealloc space that is having 3159 * search for the prealloc space that is having
3169 * minimal distance from the goal block. 3160 * minimal distance from the goal block.
@@ -3526,8 +3517,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3526 if (bit >= end) 3517 if (bit >= end)
3527 break; 3518 break;
3528 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3519 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3529 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + 3520 start = ext4_group_first_block_no(sb, group) + bit;
3530 le32_to_cpu(sbi->s_es->s_first_data_block);
3531 mb_debug(1, " free preallocated %u/%u in group %u\n", 3521 mb_debug(1, " free preallocated %u/%u in group %u\n",
3532 (unsigned) start, (unsigned) next - bit, 3522 (unsigned) start, (unsigned) next - bit,
3533 (unsigned) group); 3523 (unsigned) group);
@@ -3623,15 +3613,13 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3623 3613
3624 bitmap_bh = ext4_read_block_bitmap(sb, group); 3614 bitmap_bh = ext4_read_block_bitmap(sb, group);
3625 if (bitmap_bh == NULL) { 3615 if (bitmap_bh == NULL) {
3626 ext4_error(sb, __func__, "Error in reading block " 3616 ext4_error(sb, "Error reading block bitmap for %u", group);
3627 "bitmap for %u", group);
3628 return 0; 3617 return 0;
3629 } 3618 }
3630 3619
3631 err = ext4_mb_load_buddy(sb, group, &e4b); 3620 err = ext4_mb_load_buddy(sb, group, &e4b);
3632 if (err) { 3621 if (err) {
3633 ext4_error(sb, __func__, "Error in loading buddy " 3622 ext4_error(sb, "Error loading buddy information for %u", group);
3634 "information for %u", group);
3635 put_bh(bitmap_bh); 3623 put_bh(bitmap_bh);
3636 return 0; 3624 return 0;
3637 } 3625 }
@@ -3804,15 +3792,15 @@ repeat:
3804 3792
3805 err = ext4_mb_load_buddy(sb, group, &e4b); 3793 err = ext4_mb_load_buddy(sb, group, &e4b);
3806 if (err) { 3794 if (err) {
3807 ext4_error(sb, __func__, "Error in loading buddy " 3795 ext4_error(sb, "Error loading buddy information for %u",
3808 "information for %u", group); 3796 group);
3809 continue; 3797 continue;
3810 } 3798 }
3811 3799
3812 bitmap_bh = ext4_read_block_bitmap(sb, group); 3800 bitmap_bh = ext4_read_block_bitmap(sb, group);
3813 if (bitmap_bh == NULL) { 3801 if (bitmap_bh == NULL) {
3814 ext4_error(sb, __func__, "Error in reading block " 3802 ext4_error(sb, "Error reading block bitmap for %u",
3815 "bitmap for %u", group); 3803 group);
3816 ext4_mb_release_desc(&e4b); 3804 ext4_mb_release_desc(&e4b);
3817 continue; 3805 continue;
3818 } 3806 }
@@ -3938,7 +3926,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
3938 3926
3939 /* don't use group allocation for large files */ 3927 /* don't use group allocation for large files */
3940 size = max(size, isize); 3928 size = max(size, isize);
3941 if (size >= sbi->s_mb_stream_request) { 3929 if (size > sbi->s_mb_stream_request) {
3942 ac->ac_flags |= EXT4_MB_STREAM_ALLOC; 3930 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
3943 return; 3931 return;
3944 } 3932 }
@@ -4077,8 +4065,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4077 4065
4078 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4066 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
4079 if (ext4_mb_load_buddy(sb, group, &e4b)) { 4067 if (ext4_mb_load_buddy(sb, group, &e4b)) {
4080 ext4_error(sb, __func__, "Error in loading buddy " 4068 ext4_error(sb, "Error loading buddy information for %u",
4081 "information for %u", group); 4069 group);
4082 continue; 4070 continue;
4083 } 4071 }
4084 ext4_lock_group(sb, group); 4072 ext4_lock_group(sb, group);
@@ -4254,7 +4242,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4254 return 0; 4242 return 0;
4255 } 4243 }
4256 reserv_blks = ar->len; 4244 reserv_blks = ar->len;
4257 while (ar->len && vfs_dq_alloc_block(ar->inode, ar->len)) { 4245 while (ar->len && dquot_alloc_block(ar->inode, ar->len)) {
4258 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4246 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4259 ar->len--; 4247 ar->len--;
4260 } 4248 }
@@ -4331,7 +4319,7 @@ out2:
4331 kmem_cache_free(ext4_ac_cachep, ac); 4319 kmem_cache_free(ext4_ac_cachep, ac);
4332out1: 4320out1:
4333 if (inquota && ar->len < inquota) 4321 if (inquota && ar->len < inquota)
4334 vfs_dq_free_block(ar->inode, inquota - ar->len); 4322 dquot_free_block(ar->inode, inquota - ar->len);
4335out3: 4323out3:
4336 if (!ar->len) { 4324 if (!ar->len) {
4337 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4325 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
@@ -4476,10 +4464,10 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4476 4464
4477 sbi = EXT4_SB(sb); 4465 sbi = EXT4_SB(sb);
4478 es = EXT4_SB(sb)->s_es; 4466 es = EXT4_SB(sb)->s_es;
4479 if (!ext4_data_block_valid(sbi, block, count)) { 4467 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
4480 ext4_error(sb, __func__, 4468 !ext4_data_block_valid(sbi, block, count)) {
4481 "Freeing blocks not in datazone - " 4469 ext4_error(sb, "Freeing blocks not in datazone - "
4482 "block = %llu, count = %lu", block, count); 4470 "block = %llu, count = %lu", block, count);
4483 goto error_return; 4471 goto error_return;
4484 } 4472 }
4485 4473
@@ -4547,8 +4535,7 @@ do_more:
4547 in_range(block + count - 1, ext4_inode_table(sb, gdp), 4535 in_range(block + count - 1, ext4_inode_table(sb, gdp),
4548 EXT4_SB(sb)->s_itb_per_group)) { 4536 EXT4_SB(sb)->s_itb_per_group)) {
4549 4537
4550 ext4_error(sb, __func__, 4538 ext4_error(sb, "Freeing blocks in system zone - "
4551 "Freeing blocks in system zone - "
4552 "Block = %llu, count = %lu", block, count); 4539 "Block = %llu, count = %lu", block, count);
4553 /* err = 0. ext4_std_error should be a no op */ 4540 /* err = 0. ext4_std_error should be a no op */
4554 goto error_return; 4541 goto error_return;
@@ -4646,7 +4633,7 @@ do_more:
4646 sb->s_dirt = 1; 4633 sb->s_dirt = 1;
4647error_return: 4634error_return:
4648 if (freed) 4635 if (freed)
4649 vfs_dq_free_block(inode, freed); 4636 dquot_free_block(inode, freed);
4650 brelse(bitmap_bh); 4637 brelse(bitmap_bh);
4651 ext4_std_error(sb, err); 4638 ext4_std_error(sb, err);
4652 if (ac) 4639 if (ac)
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 0ca811061bc7..b619322c76f0 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -17,7 +17,6 @@
17#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
18#include <linux/pagemap.h> 18#include <linux/pagemap.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/version.h>
21#include <linux/blkdev.h> 20#include <linux/blkdev.h>
22#include <linux/mutex.h> 21#include <linux/mutex.h>
23#include "ext4_jbd2.h" 22#include "ext4_jbd2.h"
@@ -221,16 +220,9 @@ struct ext4_buddy {
221#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 220#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
222#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 221#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
223 222
224#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
225
226static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 223static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
227 struct ext4_free_extent *fex) 224 struct ext4_free_extent *fex)
228{ 225{
229 ext4_fsblk_t block; 226 return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start;
230
231 block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb)
232 + fex->fe_start
233 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
234 return block;
235} 227}
236#endif 228#endif
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 81415814b00b..34dcfc52ef44 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -13,6 +13,7 @@
13 */ 13 */
14 14
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h>
16#include "ext4_jbd2.h" 17#include "ext4_jbd2.h"
17#include "ext4_extents.h" 18#include "ext4_extents.h"
18 19
@@ -365,12 +366,12 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
365 * happened after we started the migrate. We need to 366 * happened after we started the migrate. We need to
366 * fail the migrate 367 * fail the migrate
367 */ 368 */
368 if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) { 369 if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) {
369 retval = -EAGAIN; 370 retval = -EAGAIN;
370 up_write(&EXT4_I(inode)->i_data_sem); 371 up_write(&EXT4_I(inode)->i_data_sem);
371 goto err_out; 372 goto err_out;
372 } else 373 } else
373 EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; 374 ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
374 /* 375 /*
375 * We have the extent map built with the tmp inode. 376 * We have the extent map built with the tmp inode.
376 * Now copy the i_data across 377 * Now copy the i_data across
@@ -503,14 +504,10 @@ int ext4_ext_migrate(struct inode *inode)
503 } 504 }
504 i_size_write(tmp_inode, i_size_read(inode)); 505 i_size_write(tmp_inode, i_size_read(inode));
505 /* 506 /*
506 * We don't want the inode to be reclaimed 507 * Set the i_nlink to zero so it will be deleted later
507 * if we got interrupted in between. We have 508 * when we drop inode reference.
508 * this tmp inode carrying reference to the
509 * data blocks of the original file. We set
510 * the i_nlink to zero at the last stage after
511 * switching the original file to extent format
512 */ 509 */
513 tmp_inode->i_nlink = 1; 510 tmp_inode->i_nlink = 0;
514 511
515 ext4_ext_tree_init(handle, tmp_inode); 512 ext4_ext_tree_init(handle, tmp_inode);
516 ext4_orphan_add(handle, tmp_inode); 513 ext4_orphan_add(handle, tmp_inode);
@@ -533,10 +530,20 @@ int ext4_ext_migrate(struct inode *inode)
533 * allocation. 530 * allocation.
534 */ 531 */
535 down_read((&EXT4_I(inode)->i_data_sem)); 532 down_read((&EXT4_I(inode)->i_data_sem));
536 EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE; 533 ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
537 up_read((&EXT4_I(inode)->i_data_sem)); 534 up_read((&EXT4_I(inode)->i_data_sem));
538 535
539 handle = ext4_journal_start(inode, 1); 536 handle = ext4_journal_start(inode, 1);
537 if (IS_ERR(handle)) {
538 /*
539 * It is impossible to update on-disk structures without
540 * a handle, so just rollback in-core changes and live other
541 * work to orphan_list_cleanup()
542 */
543 ext4_orphan_del(NULL, tmp_inode);
544 retval = PTR_ERR(handle);
545 goto out;
546 }
540 547
541 ei = EXT4_I(inode); 548 ei = EXT4_I(inode);
542 i_data = ei->i_data; 549 i_data = ei->i_data;
@@ -618,15 +625,8 @@ err_out:
618 625
619 /* Reset the extent details */ 626 /* Reset the extent details */
620 ext4_ext_tree_init(handle, tmp_inode); 627 ext4_ext_tree_init(handle, tmp_inode);
621
622 /*
623 * Set the i_nlink to zero so that
624 * generic_drop_inode really deletes the
625 * inode
626 */
627 tmp_inode->i_nlink = 0;
628
629 ext4_journal_stop(handle); 628 ext4_journal_stop(handle);
629out:
630 unlock_new_inode(tmp_inode); 630 unlock_new_inode(tmp_inode);
631 iput(tmp_inode); 631 iput(tmp_inode);
632 632
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 82c415be87a4..d1fc662cc311 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -15,6 +15,7 @@
15 15
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/quotaops.h> 17#include <linux/quotaops.h>
18#include <linux/slab.h>
18#include "ext4_jbd2.h" 19#include "ext4_jbd2.h"
19#include "ext4_extents.h" 20#include "ext4_extents.h"
20#include "ext4.h" 21#include "ext4.h"
@@ -152,12 +153,12 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2,
152 int ret = 0; 153 int ret = 0;
153 154
154 if (inode1 == NULL) { 155 if (inode1 == NULL) {
155 ext4_error(inode2->i_sb, function, 156 __ext4_error(inode2->i_sb, function,
156 "Both inodes should not be NULL: " 157 "Both inodes should not be NULL: "
157 "inode1 NULL inode2 %lu", inode2->i_ino); 158 "inode1 NULL inode2 %lu", inode2->i_ino);
158 ret = -EIO; 159 ret = -EIO;
159 } else if (inode2 == NULL) { 160 } else if (inode2 == NULL) {
160 ext4_error(inode1->i_sb, function, 161 __ext4_error(inode1->i_sb, function,
161 "Both inodes should not be NULL: " 162 "Both inodes should not be NULL: "
162 "inode1 %lu inode2 NULL", inode1->i_ino); 163 "inode1 %lu inode2 NULL", inode1->i_ino);
163 ret = -EIO; 164 ret = -EIO;
@@ -252,6 +253,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
252 } 253 }
253 254
254 o_start->ee_len = start_ext->ee_len; 255 o_start->ee_len = start_ext->ee_len;
256 eblock = le32_to_cpu(start_ext->ee_block);
255 new_flag = 1; 257 new_flag = 1;
256 258
257 } else if (start_ext->ee_len && new_ext->ee_len && 259 } else if (start_ext->ee_len && new_ext->ee_len &&
@@ -262,6 +264,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
262 * orig |------------------------------| 264 * orig |------------------------------|
263 */ 265 */
264 o_start->ee_len = start_ext->ee_len; 266 o_start->ee_len = start_ext->ee_len;
267 eblock = le32_to_cpu(start_ext->ee_block);
265 new_flag = 1; 268 new_flag = 1;
266 269
267 } else if (!start_ext->ee_len && new_ext->ee_len && 270 } else if (!start_ext->ee_len && new_ext->ee_len &&
@@ -475,7 +478,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
475 struct ext4_extent *oext, *o_start, *o_end, *prev_ext; 478 struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
476 struct ext4_extent new_ext, start_ext, end_ext; 479 struct ext4_extent new_ext, start_ext, end_ext;
477 ext4_lblk_t new_ext_end; 480 ext4_lblk_t new_ext_end;
478 ext4_fsblk_t new_phys_end;
479 int oext_alen, new_ext_alen, end_ext_alen; 481 int oext_alen, new_ext_alen, end_ext_alen;
480 int depth = ext_depth(orig_inode); 482 int depth = ext_depth(orig_inode);
481 int ret; 483 int ret;
@@ -489,7 +491,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
489 new_ext.ee_len = dext->ee_len; 491 new_ext.ee_len = dext->ee_len;
490 new_ext_alen = ext4_ext_get_actual_len(&new_ext); 492 new_ext_alen = ext4_ext_get_actual_len(&new_ext);
491 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; 493 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
492 new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1;
493 494
494 /* 495 /*
495 * Case: original extent is first 496 * Case: original extent is first
@@ -502,6 +503,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
502 le32_to_cpu(oext->ee_block) + oext_alen) { 503 le32_to_cpu(oext->ee_block) + oext_alen) {
503 start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) - 504 start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
504 le32_to_cpu(oext->ee_block)); 505 le32_to_cpu(oext->ee_block));
506 start_ext.ee_block = oext->ee_block;
505 copy_extent_status(oext, &start_ext); 507 copy_extent_status(oext, &start_ext);
506 } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) { 508 } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
507 prev_ext = oext - 1; 509 prev_ext = oext - 1;
@@ -515,6 +517,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
515 start_ext.ee_len = cpu_to_le16( 517 start_ext.ee_len = cpu_to_le16(
516 ext4_ext_get_actual_len(prev_ext) + 518 ext4_ext_get_actual_len(prev_ext) +
517 new_ext_alen); 519 new_ext_alen);
520 start_ext.ee_block = oext->ee_block;
518 copy_extent_status(prev_ext, &start_ext); 521 copy_extent_status(prev_ext, &start_ext);
519 new_ext.ee_len = 0; 522 new_ext.ee_len = 0;
520 } 523 }
@@ -526,7 +529,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
526 * new_ext |-------| 529 * new_ext |-------|
527 */ 530 */
528 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { 531 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
529 ext4_error(orig_inode->i_sb, __func__, 532 ext4_error(orig_inode->i_sb,
530 "new_ext_end(%u) should be less than or equal to " 533 "new_ext_end(%u) should be less than or equal to "
531 "oext->ee_block(%u) + oext_alen(%d) - 1", 534 "oext->ee_block(%u) + oext_alen(%d) - 1",
532 new_ext_end, le32_to_cpu(oext->ee_block), 535 new_ext_end, le32_to_cpu(oext->ee_block),
@@ -689,12 +692,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
689 while (1) { 692 while (1) {
690 /* The extent for donor must be found. */ 693 /* The extent for donor must be found. */
691 if (!dext) { 694 if (!dext) {
692 ext4_error(donor_inode->i_sb, __func__, 695 ext4_error(donor_inode->i_sb,
693 "The extent for donor must be found"); 696 "The extent for donor must be found");
694 *err = -EIO; 697 *err = -EIO;
695 goto out; 698 goto out;
696 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { 699 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
697 ext4_error(donor_inode->i_sb, __func__, 700 ext4_error(donor_inode->i_sb,
698 "Donor offset(%u) and the first block of donor " 701 "Donor offset(%u) and the first block of donor "
699 "extent(%u) should be equal", 702 "extent(%u) should be equal",
700 donor_off, 703 donor_off,
@@ -928,7 +931,7 @@ out2:
928} 931}
929 932
930/** 933/**
931 * mext_check_argumants - Check whether move extent can be done 934 * mext_check_arguments - Check whether move extent can be done
932 * 935 *
933 * @orig_inode: original inode 936 * @orig_inode: original inode
934 * @donor_inode: donor inode 937 * @donor_inode: donor inode
@@ -949,14 +952,6 @@ mext_check_arguments(struct inode *orig_inode,
949 unsigned int blkbits = orig_inode->i_blkbits; 952 unsigned int blkbits = orig_inode->i_blkbits;
950 unsigned int blocksize = 1 << blkbits; 953 unsigned int blocksize = 1 << blkbits;
951 954
952 /* Regular file check */
953 if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
954 ext4_debug("ext4 move extent: The argument files should be "
955 "regular file [ino:orig %lu, donor %lu]\n",
956 orig_inode->i_ino, donor_inode->i_ino);
957 return -EINVAL;
958 }
959
960 if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { 955 if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
961 ext4_debug("ext4 move extent: suid or sgid is set" 956 ext4_debug("ext4 move extent: suid or sgid is set"
962 " to donor file [ino:orig %lu, donor %lu]\n", 957 " to donor file [ino:orig %lu, donor %lu]\n",
@@ -1204,6 +1199,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1204 return -EINVAL; 1199 return -EINVAL;
1205 } 1200 }
1206 1201
1202 /* Regular file check */
1203 if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
1204 ext4_debug("ext4 move extent: The argument files should be "
1205 "regular file [ino:orig %lu, donor %lu]\n",
1206 orig_inode->i_ino, donor_inode->i_ino);
1207 return -EINVAL;
1208 }
1209
1207 /* Protect orig and donor inodes against a truncate */ 1210 /* Protect orig and donor inodes against a truncate */
1208 ret1 = mext_inode_double_lock(orig_inode, donor_inode); 1211 ret1 = mext_inode_double_lock(orig_inode, donor_inode);
1209 if (ret1 < 0) 1212 if (ret1 < 0)
@@ -1351,7 +1354,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1351 if (ret1 < 0) 1354 if (ret1 < 0)
1352 break; 1355 break;
1353 if (*moved_len > len) { 1356 if (*moved_len > len) {
1354 ext4_error(orig_inode->i_sb, __func__, 1357 ext4_error(orig_inode->i_sb,
1355 "We replaced blocks too much! " 1358 "We replaced blocks too much! "
1356 "sum of replaced: %llu requested: %llu", 1359 "sum of replaced: %llu requested: %llu",
1357 *moved_len, len); 1360 *moved_len, len);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 17a17e10dd60..0c070fabd108 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -383,8 +383,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
383 if (root->info.hash_version != DX_HASH_TEA && 383 if (root->info.hash_version != DX_HASH_TEA &&
384 root->info.hash_version != DX_HASH_HALF_MD4 && 384 root->info.hash_version != DX_HASH_HALF_MD4 &&
385 root->info.hash_version != DX_HASH_LEGACY) { 385 root->info.hash_version != DX_HASH_LEGACY) {
386 ext4_warning(dir->i_sb, __func__, 386 ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
387 "Unrecognised inode hash code %d",
388 root->info.hash_version); 387 root->info.hash_version);
389 brelse(bh); 388 brelse(bh);
390 *err = ERR_BAD_DX_DIR; 389 *err = ERR_BAD_DX_DIR;
@@ -399,8 +398,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
399 hash = hinfo->hash; 398 hash = hinfo->hash;
400 399
401 if (root->info.unused_flags & 1) { 400 if (root->info.unused_flags & 1) {
402 ext4_warning(dir->i_sb, __func__, 401 ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
403 "Unimplemented inode hash flags: %#06x",
404 root->info.unused_flags); 402 root->info.unused_flags);
405 brelse(bh); 403 brelse(bh);
406 *err = ERR_BAD_DX_DIR; 404 *err = ERR_BAD_DX_DIR;
@@ -408,8 +406,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
408 } 406 }
409 407
410 if ((indirect = root->info.indirect_levels) > 1) { 408 if ((indirect = root->info.indirect_levels) > 1) {
411 ext4_warning(dir->i_sb, __func__, 409 ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
412 "Unimplemented inode hash depth: %#06x",
413 root->info.indirect_levels); 410 root->info.indirect_levels);
414 brelse(bh); 411 brelse(bh);
415 *err = ERR_BAD_DX_DIR; 412 *err = ERR_BAD_DX_DIR;
@@ -421,8 +418,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
421 418
422 if (dx_get_limit(entries) != dx_root_limit(dir, 419 if (dx_get_limit(entries) != dx_root_limit(dir,
423 root->info.info_length)) { 420 root->info.info_length)) {
424 ext4_warning(dir->i_sb, __func__, 421 ext4_warning(dir->i_sb, "dx entry: limit != root limit");
425 "dx entry: limit != root limit");
426 brelse(bh); 422 brelse(bh);
427 *err = ERR_BAD_DX_DIR; 423 *err = ERR_BAD_DX_DIR;
428 goto fail; 424 goto fail;
@@ -433,7 +429,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
433 { 429 {
434 count = dx_get_count(entries); 430 count = dx_get_count(entries);
435 if (!count || count > dx_get_limit(entries)) { 431 if (!count || count > dx_get_limit(entries)) {
436 ext4_warning(dir->i_sb, __func__, 432 ext4_warning(dir->i_sb,
437 "dx entry: no count or count > limit"); 433 "dx entry: no count or count > limit");
438 brelse(bh); 434 brelse(bh);
439 *err = ERR_BAD_DX_DIR; 435 *err = ERR_BAD_DX_DIR;
@@ -478,7 +474,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
478 goto fail2; 474 goto fail2;
479 at = entries = ((struct dx_node *) bh->b_data)->entries; 475 at = entries = ((struct dx_node *) bh->b_data)->entries;
480 if (dx_get_limit(entries) != dx_node_limit (dir)) { 476 if (dx_get_limit(entries) != dx_node_limit (dir)) {
481 ext4_warning(dir->i_sb, __func__, 477 ext4_warning(dir->i_sb,
482 "dx entry: limit != node limit"); 478 "dx entry: limit != node limit");
483 brelse(bh); 479 brelse(bh);
484 *err = ERR_BAD_DX_DIR; 480 *err = ERR_BAD_DX_DIR;
@@ -494,7 +490,7 @@ fail2:
494 } 490 }
495fail: 491fail:
496 if (*err == ERR_BAD_DX_DIR) 492 if (*err == ERR_BAD_DX_DIR)
497 ext4_warning(dir->i_sb, __func__, 493 ext4_warning(dir->i_sb,
498 "Corrupt dir inode %ld, running e2fsck is " 494 "Corrupt dir inode %ld, running e2fsck is "
499 "recommended.", dir->i_ino); 495 "recommended.", dir->i_ino);
500 return NULL; 496 return NULL;
@@ -947,9 +943,8 @@ restart:
947 wait_on_buffer(bh); 943 wait_on_buffer(bh);
948 if (!buffer_uptodate(bh)) { 944 if (!buffer_uptodate(bh)) {
949 /* read error, skip block & hope for the best */ 945 /* read error, skip block & hope for the best */
950 ext4_error(sb, __func__, "reading directory #%lu " 946 ext4_error(sb, "reading directory #%lu offset %lu",
951 "offset %lu", dir->i_ino, 947 dir->i_ino, (unsigned long)block);
952 (unsigned long)block);
953 brelse(bh); 948 brelse(bh);
954 goto next; 949 goto next;
955 } 950 }
@@ -1041,7 +1036,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
1041 retval = ext4_htree_next_block(dir, hash, frame, 1036 retval = ext4_htree_next_block(dir, hash, frame,
1042 frames, NULL); 1037 frames, NULL);
1043 if (retval < 0) { 1038 if (retval < 0) {
1044 ext4_warning(sb, __func__, 1039 ext4_warning(sb,
1045 "error reading index page in directory #%lu", 1040 "error reading index page in directory #%lu",
1046 dir->i_ino); 1041 dir->i_ino);
1047 *err = retval; 1042 *err = retval;
@@ -1071,14 +1066,13 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1071 __u32 ino = le32_to_cpu(de->inode); 1066 __u32 ino = le32_to_cpu(de->inode);
1072 brelse(bh); 1067 brelse(bh);
1073 if (!ext4_valid_inum(dir->i_sb, ino)) { 1068 if (!ext4_valid_inum(dir->i_sb, ino)) {
1074 ext4_error(dir->i_sb, "ext4_lookup", 1069 ext4_error(dir->i_sb, "bad inode number: %u", ino);
1075 "bad inode number: %u", ino);
1076 return ERR_PTR(-EIO); 1070 return ERR_PTR(-EIO);
1077 } 1071 }
1078 inode = ext4_iget(dir->i_sb, ino); 1072 inode = ext4_iget(dir->i_sb, ino);
1079 if (unlikely(IS_ERR(inode))) { 1073 if (unlikely(IS_ERR(inode))) {
1080 if (PTR_ERR(inode) == -ESTALE) { 1074 if (PTR_ERR(inode) == -ESTALE) {
1081 ext4_error(dir->i_sb, __func__, 1075 ext4_error(dir->i_sb,
1082 "deleted inode referenced: %u", 1076 "deleted inode referenced: %u",
1083 ino); 1077 ino);
1084 return ERR_PTR(-EIO); 1078 return ERR_PTR(-EIO);
@@ -1110,7 +1104,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
1110 brelse(bh); 1104 brelse(bh);
1111 1105
1112 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { 1106 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1113 ext4_error(child->d_inode->i_sb, "ext4_get_parent", 1107 ext4_error(child->d_inode->i_sb,
1114 "bad inode number: %u", ino); 1108 "bad inode number: %u", ino);
1115 return ERR_PTR(-EIO); 1109 return ERR_PTR(-EIO);
1116 } 1110 }
@@ -1410,7 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1410 de = (struct ext4_dir_entry_2 *)((char *)fde + 1404 de = (struct ext4_dir_entry_2 *)((char *)fde +
1411 ext4_rec_len_from_disk(fde->rec_len, blocksize)); 1405 ext4_rec_len_from_disk(fde->rec_len, blocksize));
1412 if ((char *) de >= (((char *) root) + blocksize)) { 1406 if ((char *) de >= (((char *) root) + blocksize)) {
1413 ext4_error(dir->i_sb, __func__, 1407 ext4_error(dir->i_sb,
1414 "invalid rec_len for '..' in inode %lu", 1408 "invalid rec_len for '..' in inode %lu",
1415 dir->i_ino); 1409 dir->i_ino);
1416 brelse(bh); 1410 brelse(bh);
@@ -1575,8 +1569,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1575 1569
1576 if (levels && (dx_get_count(frames->entries) == 1570 if (levels && (dx_get_count(frames->entries) ==
1577 dx_get_limit(frames->entries))) { 1571 dx_get_limit(frames->entries))) {
1578 ext4_warning(sb, __func__, 1572 ext4_warning(sb, "Directory index full!");
1579 "Directory index full!");
1580 err = -ENOSPC; 1573 err = -ENOSPC;
1581 goto cleanup; 1574 goto cleanup;
1582 } 1575 }
@@ -1766,6 +1759,8 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
1766 struct inode *inode; 1759 struct inode *inode;
1767 int err, retries = 0; 1760 int err, retries = 0;
1768 1761
1762 dquot_initialize(dir);
1763
1769retry: 1764retry:
1770 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1765 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1771 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1766 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1800,6 +1795,8 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
1800 if (!new_valid_dev(rdev)) 1795 if (!new_valid_dev(rdev))
1801 return -EINVAL; 1796 return -EINVAL;
1802 1797
1798 dquot_initialize(dir);
1799
1803retry: 1800retry:
1804 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1801 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1805 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1802 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1837,6 +1834,8 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1837 if (EXT4_DIR_LINK_MAX(dir)) 1834 if (EXT4_DIR_LINK_MAX(dir))
1838 return -EMLINK; 1835 return -EMLINK;
1839 1836
1837 dquot_initialize(dir);
1838
1840retry: 1839retry:
1841 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1840 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1842 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1841 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -1916,11 +1915,11 @@ static int empty_dir(struct inode *inode)
1916 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || 1915 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
1917 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { 1916 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
1918 if (err) 1917 if (err)
1919 ext4_error(inode->i_sb, __func__, 1918 ext4_error(inode->i_sb,
1920 "error %d reading directory #%lu offset 0", 1919 "error %d reading directory #%lu offset 0",
1921 err, inode->i_ino); 1920 err, inode->i_ino);
1922 else 1921 else
1923 ext4_warning(inode->i_sb, __func__, 1922 ext4_warning(inode->i_sb,
1924 "bad directory (dir #%lu) - no data block", 1923 "bad directory (dir #%lu) - no data block",
1925 inode->i_ino); 1924 inode->i_ino);
1926 return 1; 1925 return 1;
@@ -1931,7 +1930,7 @@ static int empty_dir(struct inode *inode)
1931 !le32_to_cpu(de1->inode) || 1930 !le32_to_cpu(de1->inode) ||
1932 strcmp(".", de->name) || 1931 strcmp(".", de->name) ||
1933 strcmp("..", de1->name)) { 1932 strcmp("..", de1->name)) {
1934 ext4_warning(inode->i_sb, "empty_dir", 1933 ext4_warning(inode->i_sb,
1935 "bad directory (dir #%lu) - no `.' or `..'", 1934 "bad directory (dir #%lu) - no `.' or `..'",
1936 inode->i_ino); 1935 inode->i_ino);
1937 brelse(bh); 1936 brelse(bh);
@@ -1949,7 +1948,7 @@ static int empty_dir(struct inode *inode)
1949 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); 1948 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
1950 if (!bh) { 1949 if (!bh) {
1951 if (err) 1950 if (err)
1952 ext4_error(sb, __func__, 1951 ext4_error(sb,
1953 "error %d reading directory" 1952 "error %d reading directory"
1954 " #%lu offset %u", 1953 " #%lu offset %u",
1955 err, inode->i_ino, offset); 1954 err, inode->i_ino, offset);
@@ -2020,11 +2019,18 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
2020 err = ext4_reserve_inode_write(handle, inode, &iloc); 2019 err = ext4_reserve_inode_write(handle, inode, &iloc);
2021 if (err) 2020 if (err)
2022 goto out_unlock; 2021 goto out_unlock;
2022 /*
2023 * Due to previous errors inode may be already a part of on-disk
2024 * orphan list. If so skip on-disk list modification.
2025 */
2026 if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
2027 (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
2028 goto mem_insert;
2023 2029
2024 /* Insert this inode at the head of the on-disk orphan list... */ 2030 /* Insert this inode at the head of the on-disk orphan list... */
2025 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); 2031 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
2026 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); 2032 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
2027 err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh); 2033 err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
2028 rc = ext4_mark_iloc_dirty(handle, inode, &iloc); 2034 rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
2029 if (!err) 2035 if (!err)
2030 err = rc; 2036 err = rc;
@@ -2037,6 +2043,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
2037 * 2043 *
2038 * This is safe: on error we're going to ignore the orphan list 2044 * This is safe: on error we're going to ignore the orphan list
2039 * anyway on the next recovery. */ 2045 * anyway on the next recovery. */
2046mem_insert:
2040 if (!err) 2047 if (!err)
2041 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); 2048 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
2042 2049
@@ -2096,7 +2103,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2096 if (err) 2103 if (err)
2097 goto out_brelse; 2104 goto out_brelse;
2098 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); 2105 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
2099 err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh); 2106 err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
2100 } else { 2107 } else {
2101 struct ext4_iloc iloc2; 2108 struct ext4_iloc iloc2;
2102 struct inode *i_prev = 2109 struct inode *i_prev =
@@ -2136,7 +2143,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2136 2143
2137 /* Initialize quotas before so that eventual writes go in 2144 /* Initialize quotas before so that eventual writes go in
2138 * separate transaction */ 2145 * separate transaction */
2139 vfs_dq_init(dentry->d_inode); 2146 dquot_initialize(dir);
2147 dquot_initialize(dentry->d_inode);
2148
2140 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); 2149 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
2141 if (IS_ERR(handle)) 2150 if (IS_ERR(handle))
2142 return PTR_ERR(handle); 2151 return PTR_ERR(handle);
@@ -2163,7 +2172,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2163 if (retval) 2172 if (retval)
2164 goto end_rmdir; 2173 goto end_rmdir;
2165 if (!EXT4_DIR_LINK_EMPTY(inode)) 2174 if (!EXT4_DIR_LINK_EMPTY(inode))
2166 ext4_warning(inode->i_sb, "ext4_rmdir", 2175 ext4_warning(inode->i_sb,
2167 "empty directory has too many links (%d)", 2176 "empty directory has too many links (%d)",
2168 inode->i_nlink); 2177 inode->i_nlink);
2169 inode->i_version++; 2178 inode->i_version++;
@@ -2195,7 +2204,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2195 2204
2196 /* Initialize quotas before so that eventual writes go 2205 /* Initialize quotas before so that eventual writes go
2197 * in separate transaction */ 2206 * in separate transaction */
2198 vfs_dq_init(dentry->d_inode); 2207 dquot_initialize(dir);
2208 dquot_initialize(dentry->d_inode);
2209
2199 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); 2210 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
2200 if (IS_ERR(handle)) 2211 if (IS_ERR(handle))
2201 return PTR_ERR(handle); 2212 return PTR_ERR(handle);
@@ -2215,7 +2226,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2215 goto end_unlink; 2226 goto end_unlink;
2216 2227
2217 if (!inode->i_nlink) { 2228 if (!inode->i_nlink) {
2218 ext4_warning(inode->i_sb, "ext4_unlink", 2229 ext4_warning(inode->i_sb,
2219 "Deleting nonexistent file (%lu), %d", 2230 "Deleting nonexistent file (%lu), %d",
2220 inode->i_ino, inode->i_nlink); 2231 inode->i_ino, inode->i_nlink);
2221 inode->i_nlink = 1; 2232 inode->i_nlink = 1;
@@ -2250,6 +2261,8 @@ static int ext4_symlink(struct inode *dir,
2250 if (l > dir->i_sb->s_blocksize) 2261 if (l > dir->i_sb->s_blocksize)
2251 return -ENAMETOOLONG; 2262 return -ENAMETOOLONG;
2252 2263
2264 dquot_initialize(dir);
2265
2253retry: 2266retry:
2254 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2267 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2255 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + 2268 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
@@ -2308,6 +2321,8 @@ static int ext4_link(struct dentry *old_dentry,
2308 if (inode->i_nlink >= EXT4_LINK_MAX) 2321 if (inode->i_nlink >= EXT4_LINK_MAX)
2309 return -EMLINK; 2322 return -EMLINK;
2310 2323
2324 dquot_initialize(dir);
2325
2311 /* 2326 /*
2312 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing 2327 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
2313 * otherwise has the potential to corrupt the orphan inode list. 2328 * otherwise has the potential to corrupt the orphan inode list.
@@ -2358,12 +2373,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2358 struct ext4_dir_entry_2 *old_de, *new_de; 2373 struct ext4_dir_entry_2 *old_de, *new_de;
2359 int retval, force_da_alloc = 0; 2374 int retval, force_da_alloc = 0;
2360 2375
2376 dquot_initialize(old_dir);
2377 dquot_initialize(new_dir);
2378
2361 old_bh = new_bh = dir_bh = NULL; 2379 old_bh = new_bh = dir_bh = NULL;
2362 2380
2363 /* Initialize quotas before so that eventual writes go 2381 /* Initialize quotas before so that eventual writes go
2364 * in separate transaction */ 2382 * in separate transaction */
2365 if (new_dentry->d_inode) 2383 if (new_dentry->d_inode)
2366 vfs_dq_init(new_dentry->d_inode); 2384 dquot_initialize(new_dentry->d_inode);
2367 handle = ext4_journal_start(old_dir, 2 * 2385 handle = ext4_journal_start(old_dir, 2 *
2368 EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + 2386 EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
2369 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); 2387 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
@@ -2462,7 +2480,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2462 } 2480 }
2463 } 2481 }
2464 if (retval) { 2482 if (retval) {
2465 ext4_warning(old_dir->i_sb, "ext4_rename", 2483 ext4_warning(old_dir->i_sb,
2466 "Deleting old file (%lu), %d, error=%d", 2484 "Deleting old file (%lu), %d, error=%d",
2467 old_dir->i_ino, old_dir->i_nlink, retval); 2485 old_dir->i_ino, old_dir->i_nlink, retval);
2468 } 2486 }
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3b2c5541d8a6..5692c48754a0 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -48,65 +48,54 @@ static int verify_group_input(struct super_block *sb,
48 48
49 ext4_get_group_no_and_offset(sb, start, NULL, &offset); 49 ext4_get_group_no_and_offset(sb, start, NULL, &offset);
50 if (group != sbi->s_groups_count) 50 if (group != sbi->s_groups_count)
51 ext4_warning(sb, __func__, 51 ext4_warning(sb, "Cannot add at group %u (only %u groups)",
52 "Cannot add at group %u (only %u groups)",
53 input->group, sbi->s_groups_count); 52 input->group, sbi->s_groups_count);
54 else if (offset != 0) 53 else if (offset != 0)
55 ext4_warning(sb, __func__, "Last group not full"); 54 ext4_warning(sb, "Last group not full");
56 else if (input->reserved_blocks > input->blocks_count / 5) 55 else if (input->reserved_blocks > input->blocks_count / 5)
57 ext4_warning(sb, __func__, "Reserved blocks too high (%u)", 56 ext4_warning(sb, "Reserved blocks too high (%u)",
58 input->reserved_blocks); 57 input->reserved_blocks);
59 else if (free_blocks_count < 0) 58 else if (free_blocks_count < 0)
60 ext4_warning(sb, __func__, "Bad blocks count %u", 59 ext4_warning(sb, "Bad blocks count %u",
61 input->blocks_count); 60 input->blocks_count);
62 else if (!(bh = sb_bread(sb, end - 1))) 61 else if (!(bh = sb_bread(sb, end - 1)))
63 ext4_warning(sb, __func__, 62 ext4_warning(sb, "Cannot read last block (%llu)",
64 "Cannot read last block (%llu)",
65 end - 1); 63 end - 1);
66 else if (outside(input->block_bitmap, start, end)) 64 else if (outside(input->block_bitmap, start, end))
67 ext4_warning(sb, __func__, 65 ext4_warning(sb, "Block bitmap not in group (block %llu)",
68 "Block bitmap not in group (block %llu)",
69 (unsigned long long)input->block_bitmap); 66 (unsigned long long)input->block_bitmap);
70 else if (outside(input->inode_bitmap, start, end)) 67 else if (outside(input->inode_bitmap, start, end))
71 ext4_warning(sb, __func__, 68 ext4_warning(sb, "Inode bitmap not in group (block %llu)",
72 "Inode bitmap not in group (block %llu)",
73 (unsigned long long)input->inode_bitmap); 69 (unsigned long long)input->inode_bitmap);
74 else if (outside(input->inode_table, start, end) || 70 else if (outside(input->inode_table, start, end) ||
75 outside(itend - 1, start, end)) 71 outside(itend - 1, start, end))
76 ext4_warning(sb, __func__, 72 ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)",
77 "Inode table not in group (blocks %llu-%llu)",
78 (unsigned long long)input->inode_table, itend - 1); 73 (unsigned long long)input->inode_table, itend - 1);
79 else if (input->inode_bitmap == input->block_bitmap) 74 else if (input->inode_bitmap == input->block_bitmap)
80 ext4_warning(sb, __func__, 75 ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)",
81 "Block bitmap same as inode bitmap (%llu)",
82 (unsigned long long)input->block_bitmap); 76 (unsigned long long)input->block_bitmap);
83 else if (inside(input->block_bitmap, input->inode_table, itend)) 77 else if (inside(input->block_bitmap, input->inode_table, itend))
84 ext4_warning(sb, __func__, 78 ext4_warning(sb, "Block bitmap (%llu) in inode table "
85 "Block bitmap (%llu) in inode table (%llu-%llu)", 79 "(%llu-%llu)",
86 (unsigned long long)input->block_bitmap, 80 (unsigned long long)input->block_bitmap,
87 (unsigned long long)input->inode_table, itend - 1); 81 (unsigned long long)input->inode_table, itend - 1);
88 else if (inside(input->inode_bitmap, input->inode_table, itend)) 82 else if (inside(input->inode_bitmap, input->inode_table, itend))
89 ext4_warning(sb, __func__, 83 ext4_warning(sb, "Inode bitmap (%llu) in inode table "
90 "Inode bitmap (%llu) in inode table (%llu-%llu)", 84 "(%llu-%llu)",
91 (unsigned long long)input->inode_bitmap, 85 (unsigned long long)input->inode_bitmap,
92 (unsigned long long)input->inode_table, itend - 1); 86 (unsigned long long)input->inode_table, itend - 1);
93 else if (inside(input->block_bitmap, start, metaend)) 87 else if (inside(input->block_bitmap, start, metaend))
94 ext4_warning(sb, __func__, 88 ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)",
95 "Block bitmap (%llu) in GDT table"
96 " (%llu-%llu)",
97 (unsigned long long)input->block_bitmap, 89 (unsigned long long)input->block_bitmap,
98 start, metaend - 1); 90 start, metaend - 1);
99 else if (inside(input->inode_bitmap, start, metaend)) 91 else if (inside(input->inode_bitmap, start, metaend))
100 ext4_warning(sb, __func__, 92 ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)",
101 "Inode bitmap (%llu) in GDT table"
102 " (%llu-%llu)",
103 (unsigned long long)input->inode_bitmap, 93 (unsigned long long)input->inode_bitmap,
104 start, metaend - 1); 94 start, metaend - 1);
105 else if (inside(input->inode_table, start, metaend) || 95 else if (inside(input->inode_table, start, metaend) ||
106 inside(itend - 1, start, metaend)) 96 inside(itend - 1, start, metaend))
107 ext4_warning(sb, __func__, 97 ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table "
108 "Inode table (%llu-%llu) overlaps" 98 "(%llu-%llu)",
109 "GDT table (%llu-%llu)",
110 (unsigned long long)input->inode_table, 99 (unsigned long long)input->inode_table,
111 itend - 1, start, metaend - 1); 100 itend - 1, start, metaend - 1);
112 else 101 else
@@ -364,8 +353,7 @@ static int verify_reserved_gdb(struct super_block *sb,
364 while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) { 353 while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) {
365 if (le32_to_cpu(*p++) != 354 if (le32_to_cpu(*p++) !=
366 grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){ 355 grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){
367 ext4_warning(sb, __func__, 356 ext4_warning(sb, "reserved GDT %llu"
368 "reserved GDT %llu"
369 " missing grp %d (%llu)", 357 " missing grp %d (%llu)",
370 blk, grp, 358 blk, grp,
371 grp * 359 grp *
@@ -420,8 +408,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
420 */ 408 */
421 if (EXT4_SB(sb)->s_sbh->b_blocknr != 409 if (EXT4_SB(sb)->s_sbh->b_blocknr !=
422 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { 410 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
423 ext4_warning(sb, __func__, 411 ext4_warning(sb, "won't resize using backup superblock at %llu",
424 "won't resize using backup superblock at %llu",
425 (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); 412 (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
426 return -EPERM; 413 return -EPERM;
427 } 414 }
@@ -444,8 +431,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
444 431
445 data = (__le32 *)dind->b_data; 432 data = (__le32 *)dind->b_data;
446 if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { 433 if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
447 ext4_warning(sb, __func__, 434 ext4_warning(sb, "new group %u GDT block %llu not reserved",
448 "new group %u GDT block %llu not reserved",
449 input->group, gdblock); 435 input->group, gdblock);
450 err = -EINVAL; 436 err = -EINVAL;
451 goto exit_dind; 437 goto exit_dind;
@@ -468,7 +454,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
468 GFP_NOFS); 454 GFP_NOFS);
469 if (!n_group_desc) { 455 if (!n_group_desc) {
470 err = -ENOMEM; 456 err = -ENOMEM;
471 ext4_warning(sb, __func__, 457 ext4_warning(sb,
472 "not enough memory for %lu groups", gdb_num + 1); 458 "not enough memory for %lu groups", gdb_num + 1);
473 goto exit_inode; 459 goto exit_inode;
474 } 460 }
@@ -567,8 +553,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
567 /* Get each reserved primary GDT block and verify it holds backups */ 553 /* Get each reserved primary GDT block and verify it holds backups */
568 for (res = 0; res < reserved_gdb; res++, blk++) { 554 for (res = 0; res < reserved_gdb; res++, blk++) {
569 if (le32_to_cpu(*data) != blk) { 555 if (le32_to_cpu(*data) != blk) {
570 ext4_warning(sb, __func__, 556 ext4_warning(sb, "reserved block %llu"
571 "reserved block %llu"
572 " not at offset %ld", 557 " not at offset %ld",
573 blk, 558 blk,
574 (long)(data - (__le32 *)dind->b_data)); 559 (long)(data - (__le32 *)dind->b_data));
@@ -713,8 +698,7 @@ static void update_backups(struct super_block *sb,
713 */ 698 */
714exit_err: 699exit_err:
715 if (err) { 700 if (err) {
716 ext4_warning(sb, __func__, 701 ext4_warning(sb, "can't update backup for group %u (err %d), "
717 "can't update backup for group %u (err %d), "
718 "forcing fsck on next reboot", group, err); 702 "forcing fsck on next reboot", group, err);
719 sbi->s_mount_state &= ~EXT4_VALID_FS; 703 sbi->s_mount_state &= ~EXT4_VALID_FS;
720 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); 704 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -753,20 +737,19 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
753 737
754 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, 738 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
755 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { 739 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
756 ext4_warning(sb, __func__, 740 ext4_warning(sb, "Can't resize non-sparse filesystem further");
757 "Can't resize non-sparse filesystem further");
758 return -EPERM; 741 return -EPERM;
759 } 742 }
760 743
761 if (ext4_blocks_count(es) + input->blocks_count < 744 if (ext4_blocks_count(es) + input->blocks_count <
762 ext4_blocks_count(es)) { 745 ext4_blocks_count(es)) {
763 ext4_warning(sb, __func__, "blocks_count overflow"); 746 ext4_warning(sb, "blocks_count overflow");
764 return -EINVAL; 747 return -EINVAL;
765 } 748 }
766 749
767 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < 750 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
768 le32_to_cpu(es->s_inodes_count)) { 751 le32_to_cpu(es->s_inodes_count)) {
769 ext4_warning(sb, __func__, "inodes_count overflow"); 752 ext4_warning(sb, "inodes_count overflow");
770 return -EINVAL; 753 return -EINVAL;
771 } 754 }
772 755
@@ -774,14 +757,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
774 if (!EXT4_HAS_COMPAT_FEATURE(sb, 757 if (!EXT4_HAS_COMPAT_FEATURE(sb,
775 EXT4_FEATURE_COMPAT_RESIZE_INODE) 758 EXT4_FEATURE_COMPAT_RESIZE_INODE)
776 || !le16_to_cpu(es->s_reserved_gdt_blocks)) { 759 || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
777 ext4_warning(sb, __func__, 760 ext4_warning(sb,
778 "No reserved GDT blocks, can't resize"); 761 "No reserved GDT blocks, can't resize");
779 return -EPERM; 762 return -EPERM;
780 } 763 }
781 inode = ext4_iget(sb, EXT4_RESIZE_INO); 764 inode = ext4_iget(sb, EXT4_RESIZE_INO);
782 if (IS_ERR(inode)) { 765 if (IS_ERR(inode)) {
783 ext4_warning(sb, __func__, 766 ext4_warning(sb, "Error opening resize inode");
784 "Error opening resize inode");
785 return PTR_ERR(inode); 767 return PTR_ERR(inode);
786 } 768 }
787 } 769 }
@@ -810,8 +792,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
810 792
811 mutex_lock(&sbi->s_resize_lock); 793 mutex_lock(&sbi->s_resize_lock);
812 if (input->group != sbi->s_groups_count) { 794 if (input->group != sbi->s_groups_count) {
813 ext4_warning(sb, __func__, 795 ext4_warning(sb, "multiple resizers run on filesystem!");
814 "multiple resizers run on filesystem!");
815 err = -EBUSY; 796 err = -EBUSY;
816 goto exit_journal; 797 goto exit_journal;
817 } 798 }
@@ -997,13 +978,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
997 " too large to resize to %llu blocks safely\n", 978 " too large to resize to %llu blocks safely\n",
998 sb->s_id, n_blocks_count); 979 sb->s_id, n_blocks_count);
999 if (sizeof(sector_t) < 8) 980 if (sizeof(sector_t) < 8)
1000 ext4_warning(sb, __func__, "CONFIG_LBDAF not enabled"); 981 ext4_warning(sb, "CONFIG_LBDAF not enabled");
1001 return -EINVAL; 982 return -EINVAL;
1002 } 983 }
1003 984
1004 if (n_blocks_count < o_blocks_count) { 985 if (n_blocks_count < o_blocks_count) {
1005 ext4_warning(sb, __func__, 986 ext4_warning(sb, "can't shrink FS - resize aborted");
1006 "can't shrink FS - resize aborted");
1007 return -EBUSY; 987 return -EBUSY;
1008 } 988 }
1009 989
@@ -1011,15 +991,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1011 ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); 991 ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
1012 992
1013 if (last == 0) { 993 if (last == 0) {
1014 ext4_warning(sb, __func__, 994 ext4_warning(sb, "need to use ext2online to resize further");
1015 "need to use ext2online to resize further");
1016 return -EPERM; 995 return -EPERM;
1017 } 996 }
1018 997
1019 add = EXT4_BLOCKS_PER_GROUP(sb) - last; 998 add = EXT4_BLOCKS_PER_GROUP(sb) - last;
1020 999
1021 if (o_blocks_count + add < o_blocks_count) { 1000 if (o_blocks_count + add < o_blocks_count) {
1022 ext4_warning(sb, __func__, "blocks_count overflow"); 1001 ext4_warning(sb, "blocks_count overflow");
1023 return -EINVAL; 1002 return -EINVAL;
1024 } 1003 }
1025 1004
@@ -1027,16 +1006,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1027 add = n_blocks_count - o_blocks_count; 1006 add = n_blocks_count - o_blocks_count;
1028 1007
1029 if (o_blocks_count + add < n_blocks_count) 1008 if (o_blocks_count + add < n_blocks_count)
1030 ext4_warning(sb, __func__, 1009 ext4_warning(sb, "will only finish group (%llu blocks, %u new)",
1031 "will only finish group (%llu"
1032 " blocks, %u new)",
1033 o_blocks_count + add, add); 1010 o_blocks_count + add, add);
1034 1011
1035 /* See if the device is actually as big as what was requested */ 1012 /* See if the device is actually as big as what was requested */
1036 bh = sb_bread(sb, o_blocks_count + add - 1); 1013 bh = sb_bread(sb, o_blocks_count + add - 1);
1037 if (!bh) { 1014 if (!bh) {
1038 ext4_warning(sb, __func__, 1015 ext4_warning(sb, "can't read last block, resize aborted");
1039 "can't read last block, resize aborted");
1040 return -ENOSPC; 1016 return -ENOSPC;
1041 } 1017 }
1042 brelse(bh); 1018 brelse(bh);
@@ -1047,14 +1023,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1047 handle = ext4_journal_start_sb(sb, 3); 1023 handle = ext4_journal_start_sb(sb, 3);
1048 if (IS_ERR(handle)) { 1024 if (IS_ERR(handle)) {
1049 err = PTR_ERR(handle); 1025 err = PTR_ERR(handle);
1050 ext4_warning(sb, __func__, "error %d on journal start", err); 1026 ext4_warning(sb, "error %d on journal start", err);
1051 goto exit_put; 1027 goto exit_put;
1052 } 1028 }
1053 1029
1054 mutex_lock(&EXT4_SB(sb)->s_resize_lock); 1030 mutex_lock(&EXT4_SB(sb)->s_resize_lock);
1055 if (o_blocks_count != ext4_blocks_count(es)) { 1031 if (o_blocks_count != ext4_blocks_count(es)) {
1056 ext4_warning(sb, __func__, 1032 ext4_warning(sb, "multiple resizers run on filesystem!");
1057 "multiple resizers run on filesystem!");
1058 mutex_unlock(&EXT4_SB(sb)->s_resize_lock); 1033 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1059 ext4_journal_stop(handle); 1034 ext4_journal_stop(handle);
1060 err = -EBUSY; 1035 err = -EBUSY;
@@ -1063,8 +1038,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1063 1038
1064 if ((err = ext4_journal_get_write_access(handle, 1039 if ((err = ext4_journal_get_write_access(handle,
1065 EXT4_SB(sb)->s_sbh))) { 1040 EXT4_SB(sb)->s_sbh))) {
1066 ext4_warning(sb, __func__, 1041 ext4_warning(sb, "error %d on journal write access", err);
1067 "error %d on journal write access", err);
1068 mutex_unlock(&EXT4_SB(sb)->s_resize_lock); 1042 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1069 ext4_journal_stop(handle); 1043 ext4_journal_stop(handle);
1070 goto exit_put; 1044 goto exit_put;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6ed9aa91f27d..e14d22c170d5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -68,7 +68,21 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
68static int ext4_unfreeze(struct super_block *sb); 68static int ext4_unfreeze(struct super_block *sb);
69static void ext4_write_super(struct super_block *sb); 69static void ext4_write_super(struct super_block *sb);
70static int ext4_freeze(struct super_block *sb); 70static int ext4_freeze(struct super_block *sb);
71static int ext4_get_sb(struct file_system_type *fs_type, int flags,
72 const char *dev_name, void *data, struct vfsmount *mnt);
71 73
74#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
75static struct file_system_type ext3_fs_type = {
76 .owner = THIS_MODULE,
77 .name = "ext3",
78 .get_sb = ext4_get_sb,
79 .kill_sb = kill_block_super,
80 .fs_flags = FS_REQUIRES_DEV,
81};
82#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
83#else
84#define IS_EXT3_SB(sb) (0)
85#endif
72 86
73ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, 87ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
74 struct ext4_group_desc *bg) 88 struct ext4_group_desc *bg)
@@ -302,7 +316,7 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
302 * write out the superblock safely. 316 * write out the superblock safely.
303 * 317 *
304 * We'll just use the jbd2_journal_abort() error code to record an error in 318 * We'll just use the jbd2_journal_abort() error code to record an error in
305 * the journal instead. On recovery, the journal will compain about 319 * the journal instead. On recovery, the journal will complain about
306 * that error until we've noted it down and cleared it. 320 * that error until we've noted it down and cleared it.
307 */ 321 */
308 322
@@ -333,7 +347,7 @@ static void ext4_handle_error(struct super_block *sb)
333 sb->s_id); 347 sb->s_id);
334} 348}
335 349
336void ext4_error(struct super_block *sb, const char *function, 350void __ext4_error(struct super_block *sb, const char *function,
337 const char *fmt, ...) 351 const char *fmt, ...)
338{ 352{
339 va_list args; 353 va_list args;
@@ -347,6 +361,42 @@ void ext4_error(struct super_block *sb, const char *function,
347 ext4_handle_error(sb); 361 ext4_handle_error(sb);
348} 362}
349 363
364void ext4_error_inode(const char *function, struct inode *inode,
365 const char *fmt, ...)
366{
367 va_list args;
368
369 va_start(args, fmt);
370 printk(KERN_CRIT "EXT4-fs error (device %s): %s: inode #%lu: (comm %s) ",
371 inode->i_sb->s_id, function, inode->i_ino, current->comm);
372 vprintk(fmt, args);
373 printk("\n");
374 va_end(args);
375
376 ext4_handle_error(inode->i_sb);
377}
378
379void ext4_error_file(const char *function, struct file *file,
380 const char *fmt, ...)
381{
382 va_list args;
383 struct inode *inode = file->f_dentry->d_inode;
384 char pathname[80], *path;
385
386 va_start(args, fmt);
387 path = d_path(&(file->f_path), pathname, sizeof(pathname));
388 if (!path)
389 path = "(unknown)";
390 printk(KERN_CRIT
391 "EXT4-fs error (device %s): %s: inode #%lu (comm %s path %s): ",
392 inode->i_sb->s_id, function, inode->i_ino, current->comm, path);
393 vprintk(fmt, args);
394 printk("\n");
395 va_end(args);
396
397 ext4_handle_error(inode->i_sb);
398}
399
350static const char *ext4_decode_error(struct super_block *sb, int errno, 400static const char *ext4_decode_error(struct super_block *sb, int errno,
351 char nbuf[16]) 401 char nbuf[16])
352{ 402{
@@ -450,7 +500,7 @@ void ext4_msg (struct super_block * sb, const char *prefix,
450 va_end(args); 500 va_end(args);
451} 501}
452 502
453void ext4_warning(struct super_block *sb, const char *function, 503void __ext4_warning(struct super_block *sb, const char *function,
454 const char *fmt, ...) 504 const char *fmt, ...)
455{ 505{
456 va_list args; 506 va_list args;
@@ -507,7 +557,7 @@ void ext4_update_dynamic_rev(struct super_block *sb)
507 if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) 557 if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
508 return; 558 return;
509 559
510 ext4_warning(sb, __func__, 560 ext4_warning(sb,
511 "updating to rev %d because of new feature flag, " 561 "updating to rev %d because of new feature flag, "
512 "running e2fsck is recommended", 562 "running e2fsck is recommended",
513 EXT4_DYNAMIC_REV); 563 EXT4_DYNAMIC_REV);
@@ -702,12 +752,14 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
702 ei->i_reserved_data_blocks = 0; 752 ei->i_reserved_data_blocks = 0;
703 ei->i_reserved_meta_blocks = 0; 753 ei->i_reserved_meta_blocks = 0;
704 ei->i_allocated_meta_blocks = 0; 754 ei->i_allocated_meta_blocks = 0;
755 ei->i_da_metadata_calc_len = 0;
705 ei->i_delalloc_reserved_flag = 0; 756 ei->i_delalloc_reserved_flag = 0;
706 spin_lock_init(&(ei->i_block_reservation_lock)); 757 spin_lock_init(&(ei->i_block_reservation_lock));
707#ifdef CONFIG_QUOTA 758#ifdef CONFIG_QUOTA
708 ei->i_reserved_quota = 0; 759 ei->i_reserved_quota = 0;
709#endif 760#endif
710 INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); 761 INIT_LIST_HEAD(&ei->i_completed_io_list);
762 spin_lock_init(&ei->i_completed_io_lock);
711 ei->cur_aio_dio = NULL; 763 ei->cur_aio_dio = NULL;
712 ei->i_sync_tid = 0; 764 ei->i_sync_tid = 0;
713 ei->i_datasync_tid = 0; 765 ei->i_datasync_tid = 0;
@@ -760,6 +812,7 @@ static void destroy_inodecache(void)
760 812
761static void ext4_clear_inode(struct inode *inode) 813static void ext4_clear_inode(struct inode *inode)
762{ 814{
815 dquot_drop(inode);
763 ext4_discard_preallocations(inode); 816 ext4_discard_preallocations(inode);
764 if (EXT4_JOURNAL(inode)) 817 if (EXT4_JOURNAL(inode))
765 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, 818 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
@@ -795,10 +848,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
795 if (sbi->s_qf_names[GRPQUOTA]) 848 if (sbi->s_qf_names[GRPQUOTA])
796 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); 849 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
797 850
798 if (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) 851 if (test_opt(sb, USRQUOTA))
799 seq_puts(seq, ",usrquota"); 852 seq_puts(seq, ",usrquota");
800 853
801 if (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) 854 if (test_opt(sb, GRPQUOTA))
802 seq_puts(seq, ",grpquota"); 855 seq_puts(seq, ",grpquota");
803#endif 856#endif
804} 857}
@@ -925,6 +978,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
925 if (test_opt(sb, NOLOAD)) 978 if (test_opt(sb, NOLOAD))
926 seq_puts(seq, ",norecovery"); 979 seq_puts(seq, ",norecovery");
927 980
981 if (test_opt(sb, DIOREAD_NOLOCK))
982 seq_puts(seq, ",dioread_nolock");
983
928 ext4_show_quota_options(seq, sb); 984 ext4_show_quota_options(seq, sb);
929 985
930 return 0; 986 return 0;
@@ -1011,19 +1067,9 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
1011 const char *data, size_t len, loff_t off); 1067 const char *data, size_t len, loff_t off);
1012 1068
1013static const struct dquot_operations ext4_quota_operations = { 1069static const struct dquot_operations ext4_quota_operations = {
1014 .initialize = dquot_initialize,
1015 .drop = dquot_drop,
1016 .alloc_space = dquot_alloc_space,
1017 .reserve_space = dquot_reserve_space,
1018 .claim_space = dquot_claim_space,
1019 .release_rsv = dquot_release_reserved_space,
1020#ifdef CONFIG_QUOTA 1070#ifdef CONFIG_QUOTA
1021 .get_reserved_space = ext4_get_reserved_space, 1071 .get_reserved_space = ext4_get_reserved_space,
1022#endif 1072#endif
1023 .alloc_inode = dquot_alloc_inode,
1024 .free_space = dquot_free_space,
1025 .free_inode = dquot_free_inode,
1026 .transfer = dquot_transfer,
1027 .write_dquot = ext4_write_dquot, 1073 .write_dquot = ext4_write_dquot,
1028 .acquire_dquot = ext4_acquire_dquot, 1074 .acquire_dquot = ext4_acquire_dquot,
1029 .release_dquot = ext4_release_dquot, 1075 .release_dquot = ext4_release_dquot,
@@ -1108,6 +1154,7 @@ enum {
1108 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1154 Opt_stripe, Opt_delalloc, Opt_nodelalloc,
1109 Opt_block_validity, Opt_noblock_validity, 1155 Opt_block_validity, Opt_noblock_validity,
1110 Opt_inode_readahead_blks, Opt_journal_ioprio, 1156 Opt_inode_readahead_blks, Opt_journal_ioprio,
1157 Opt_dioread_nolock, Opt_dioread_lock,
1111 Opt_discard, Opt_nodiscard, 1158 Opt_discard, Opt_nodiscard,
1112}; 1159};
1113 1160
@@ -1175,6 +1222,8 @@ static const match_table_t tokens = {
1175 {Opt_auto_da_alloc, "auto_da_alloc=%u"}, 1222 {Opt_auto_da_alloc, "auto_da_alloc=%u"},
1176 {Opt_auto_da_alloc, "auto_da_alloc"}, 1223 {Opt_auto_da_alloc, "auto_da_alloc"},
1177 {Opt_noauto_da_alloc, "noauto_da_alloc"}, 1224 {Opt_noauto_da_alloc, "noauto_da_alloc"},
1225 {Opt_dioread_nolock, "dioread_nolock"},
1226 {Opt_dioread_lock, "dioread_lock"},
1178 {Opt_discard, "discard"}, 1227 {Opt_discard, "discard"},
1179 {Opt_nodiscard, "nodiscard"}, 1228 {Opt_nodiscard, "nodiscard"},
1180 {Opt_err, NULL}, 1229 {Opt_err, NULL},
@@ -1204,6 +1253,66 @@ static ext4_fsblk_t get_sb_block(void **data)
1204} 1253}
1205 1254
1206#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) 1255#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
1256static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n"
1257 "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
1258
1259#ifdef CONFIG_QUOTA
1260static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
1261{
1262 struct ext4_sb_info *sbi = EXT4_SB(sb);
1263 char *qname;
1264
1265 if (sb_any_quota_loaded(sb) &&
1266 !sbi->s_qf_names[qtype]) {
1267 ext4_msg(sb, KERN_ERR,
1268 "Cannot change journaled "
1269 "quota options when quota turned on");
1270 return 0;
1271 }
1272 qname = match_strdup(args);
1273 if (!qname) {
1274 ext4_msg(sb, KERN_ERR,
1275 "Not enough memory for storing quotafile name");
1276 return 0;
1277 }
1278 if (sbi->s_qf_names[qtype] &&
1279 strcmp(sbi->s_qf_names[qtype], qname)) {
1280 ext4_msg(sb, KERN_ERR,
1281 "%s quota file already specified", QTYPE2NAME(qtype));
1282 kfree(qname);
1283 return 0;
1284 }
1285 sbi->s_qf_names[qtype] = qname;
1286 if (strchr(sbi->s_qf_names[qtype], '/')) {
1287 ext4_msg(sb, KERN_ERR,
1288 "quotafile must be on filesystem root");
1289 kfree(sbi->s_qf_names[qtype]);
1290 sbi->s_qf_names[qtype] = NULL;
1291 return 0;
1292 }
1293 set_opt(sbi->s_mount_opt, QUOTA);
1294 return 1;
1295}
1296
1297static int clear_qf_name(struct super_block *sb, int qtype)
1298{
1299
1300 struct ext4_sb_info *sbi = EXT4_SB(sb);
1301
1302 if (sb_any_quota_loaded(sb) &&
1303 sbi->s_qf_names[qtype]) {
1304 ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
1305 " when quota turned on");
1306 return 0;
1307 }
1308 /*
1309 * The space will be released later when all options are confirmed
1310 * to be correct
1311 */
1312 sbi->s_qf_names[qtype] = NULL;
1313 return 1;
1314}
1315#endif
1207 1316
1208static int parse_options(char *options, struct super_block *sb, 1317static int parse_options(char *options, struct super_block *sb,
1209 unsigned long *journal_devnum, 1318 unsigned long *journal_devnum,
@@ -1216,8 +1325,7 @@ static int parse_options(char *options, struct super_block *sb,
1216 int data_opt = 0; 1325 int data_opt = 0;
1217 int option; 1326 int option;
1218#ifdef CONFIG_QUOTA 1327#ifdef CONFIG_QUOTA
1219 int qtype, qfmt; 1328 int qfmt;
1220 char *qname;
1221#endif 1329#endif
1222 1330
1223 if (!options) 1331 if (!options)
@@ -1228,19 +1336,31 @@ static int parse_options(char *options, struct super_block *sb,
1228 if (!*p) 1336 if (!*p)
1229 continue; 1337 continue;
1230 1338
1339 /*
1340 * Initialize args struct so we know whether arg was
1341 * found; some options take optional arguments.
1342 */
1343 args[0].to = args[0].from = 0;
1231 token = match_token(p, tokens, args); 1344 token = match_token(p, tokens, args);
1232 switch (token) { 1345 switch (token) {
1233 case Opt_bsd_df: 1346 case Opt_bsd_df:
1347 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1234 clear_opt(sbi->s_mount_opt, MINIX_DF); 1348 clear_opt(sbi->s_mount_opt, MINIX_DF);
1235 break; 1349 break;
1236 case Opt_minix_df: 1350 case Opt_minix_df:
1351 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1237 set_opt(sbi->s_mount_opt, MINIX_DF); 1352 set_opt(sbi->s_mount_opt, MINIX_DF);
1353
1238 break; 1354 break;
1239 case Opt_grpid: 1355 case Opt_grpid:
1356 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1240 set_opt(sbi->s_mount_opt, GRPID); 1357 set_opt(sbi->s_mount_opt, GRPID);
1358
1241 break; 1359 break;
1242 case Opt_nogrpid: 1360 case Opt_nogrpid:
1361 ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
1243 clear_opt(sbi->s_mount_opt, GRPID); 1362 clear_opt(sbi->s_mount_opt, GRPID);
1363
1244 break; 1364 break;
1245 case Opt_resuid: 1365 case Opt_resuid:
1246 if (match_int(&args[0], &option)) 1366 if (match_int(&args[0], &option))
@@ -1377,14 +1497,13 @@ static int parse_options(char *options, struct super_block *sb,
1377 data_opt = EXT4_MOUNT_WRITEBACK_DATA; 1497 data_opt = EXT4_MOUNT_WRITEBACK_DATA;
1378 datacheck: 1498 datacheck:
1379 if (is_remount) { 1499 if (is_remount) {
1380 if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS) 1500 if (test_opt(sb, DATA_FLAGS) != data_opt) {
1381 != data_opt) {
1382 ext4_msg(sb, KERN_ERR, 1501 ext4_msg(sb, KERN_ERR,
1383 "Cannot change data mode on remount"); 1502 "Cannot change data mode on remount");
1384 return 0; 1503 return 0;
1385 } 1504 }
1386 } else { 1505 } else {
1387 sbi->s_mount_opt &= ~EXT4_MOUNT_DATA_FLAGS; 1506 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
1388 sbi->s_mount_opt |= data_opt; 1507 sbi->s_mount_opt |= data_opt;
1389 } 1508 }
1390 break; 1509 break;
@@ -1396,63 +1515,22 @@ static int parse_options(char *options, struct super_block *sb,
1396 break; 1515 break;
1397#ifdef CONFIG_QUOTA 1516#ifdef CONFIG_QUOTA
1398 case Opt_usrjquota: 1517 case Opt_usrjquota:
1399 qtype = USRQUOTA; 1518 if (!set_qf_name(sb, USRQUOTA, &args[0]))
1400 goto set_qf_name;
1401 case Opt_grpjquota:
1402 qtype = GRPQUOTA;
1403set_qf_name:
1404 if (sb_any_quota_loaded(sb) &&
1405 !sbi->s_qf_names[qtype]) {
1406 ext4_msg(sb, KERN_ERR,
1407 "Cannot change journaled "
1408 "quota options when quota turned on");
1409 return 0; 1519 return 0;
1410 } 1520 break;
1411 qname = match_strdup(&args[0]); 1521 case Opt_grpjquota:
1412 if (!qname) { 1522 if (!set_qf_name(sb, GRPQUOTA, &args[0]))
1413 ext4_msg(sb, KERN_ERR,
1414 "Not enough memory for "
1415 "storing quotafile name");
1416 return 0;
1417 }
1418 if (sbi->s_qf_names[qtype] &&
1419 strcmp(sbi->s_qf_names[qtype], qname)) {
1420 ext4_msg(sb, KERN_ERR,
1421 "%s quota file already "
1422 "specified", QTYPE2NAME(qtype));
1423 kfree(qname);
1424 return 0;
1425 }
1426 sbi->s_qf_names[qtype] = qname;
1427 if (strchr(sbi->s_qf_names[qtype], '/')) {
1428 ext4_msg(sb, KERN_ERR,
1429 "quotafile must be on "
1430 "filesystem root");
1431 kfree(sbi->s_qf_names[qtype]);
1432 sbi->s_qf_names[qtype] = NULL;
1433 return 0; 1523 return 0;
1434 }
1435 set_opt(sbi->s_mount_opt, QUOTA);
1436 break; 1524 break;
1437 case Opt_offusrjquota: 1525 case Opt_offusrjquota:
1438 qtype = USRQUOTA; 1526 if (!clear_qf_name(sb, USRQUOTA))
1439 goto clear_qf_name; 1527 return 0;
1528 break;
1440 case Opt_offgrpjquota: 1529 case Opt_offgrpjquota:
1441 qtype = GRPQUOTA; 1530 if (!clear_qf_name(sb, GRPQUOTA))
1442clear_qf_name:
1443 if (sb_any_quota_loaded(sb) &&
1444 sbi->s_qf_names[qtype]) {
1445 ext4_msg(sb, KERN_ERR, "Cannot change "
1446 "journaled quota options when "
1447 "quota turned on");
1448 return 0; 1531 return 0;
1449 }
1450 /*
1451 * The space will be released later when all options
1452 * are confirmed to be correct
1453 */
1454 sbi->s_qf_names[qtype] = NULL;
1455 break; 1532 break;
1533
1456 case Opt_jqfmt_vfsold: 1534 case Opt_jqfmt_vfsold:
1457 qfmt = QFMT_VFS_OLD; 1535 qfmt = QFMT_VFS_OLD;
1458 goto set_qf_format; 1536 goto set_qf_format;
@@ -1517,10 +1595,11 @@ set_qf_format:
1517 clear_opt(sbi->s_mount_opt, BARRIER); 1595 clear_opt(sbi->s_mount_opt, BARRIER);
1518 break; 1596 break;
1519 case Opt_barrier: 1597 case Opt_barrier:
1520 if (match_int(&args[0], &option)) { 1598 if (args[0].from) {
1521 set_opt(sbi->s_mount_opt, BARRIER); 1599 if (match_int(&args[0], &option))
1522 break; 1600 return 0;
1523 } 1601 } else
1602 option = 1; /* No argument, default to 1 */
1524 if (option) 1603 if (option)
1525 set_opt(sbi->s_mount_opt, BARRIER); 1604 set_opt(sbi->s_mount_opt, BARRIER);
1526 else 1605 else
@@ -1593,10 +1672,11 @@ set_qf_format:
1593 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); 1672 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
1594 break; 1673 break;
1595 case Opt_auto_da_alloc: 1674 case Opt_auto_da_alloc:
1596 if (match_int(&args[0], &option)) { 1675 if (args[0].from) {
1597 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); 1676 if (match_int(&args[0], &option))
1598 break; 1677 return 0;
1599 } 1678 } else
1679 option = 1; /* No argument, default to 1 */
1600 if (option) 1680 if (option)
1601 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); 1681 clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
1602 else 1682 else
@@ -1608,6 +1688,12 @@ set_qf_format:
1608 case Opt_nodiscard: 1688 case Opt_nodiscard:
1609 clear_opt(sbi->s_mount_opt, DISCARD); 1689 clear_opt(sbi->s_mount_opt, DISCARD);
1610 break; 1690 break;
1691 case Opt_dioread_nolock:
1692 set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
1693 break;
1694 case Opt_dioread_lock:
1695 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
1696 break;
1611 default: 1697 default:
1612 ext4_msg(sb, KERN_ERR, 1698 ext4_msg(sb, KERN_ERR,
1613 "Unrecognized mount option \"%s\" " 1699 "Unrecognized mount option \"%s\" "
@@ -1617,18 +1703,13 @@ set_qf_format:
1617 } 1703 }
1618#ifdef CONFIG_QUOTA 1704#ifdef CONFIG_QUOTA
1619 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { 1705 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1620 if ((sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) && 1706 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
1621 sbi->s_qf_names[USRQUOTA])
1622 clear_opt(sbi->s_mount_opt, USRQUOTA); 1707 clear_opt(sbi->s_mount_opt, USRQUOTA);
1623 1708
1624 if ((sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) && 1709 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
1625 sbi->s_qf_names[GRPQUOTA])
1626 clear_opt(sbi->s_mount_opt, GRPQUOTA); 1710 clear_opt(sbi->s_mount_opt, GRPQUOTA);
1627 1711
1628 if ((sbi->s_qf_names[USRQUOTA] && 1712 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
1629 (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
1630 (sbi->s_qf_names[GRPQUOTA] &&
1631 (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
1632 ext4_msg(sb, KERN_ERR, "old and new quota " 1713 ext4_msg(sb, KERN_ERR, "old and new quota "
1633 "format mixing"); 1714 "format mixing");
1634 return 0; 1715 return 0;
@@ -1938,7 +2019,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
1938 } 2019 }
1939 2020
1940 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); 2021 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
1941 vfs_dq_init(inode); 2022 dquot_initialize(inode);
1942 if (inode->i_nlink) { 2023 if (inode->i_nlink) {
1943 ext4_msg(sb, KERN_DEBUG, 2024 ext4_msg(sb, KERN_DEBUG,
1944 "%s: truncating inode %lu to %lld bytes", 2025 "%s: truncating inode %lu to %lld bytes",
@@ -2174,9 +2255,9 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
2174 struct super_block *sb = sbi->s_buddy_cache->i_sb; 2255 struct super_block *sb = sbi->s_buddy_cache->i_sb;
2175 2256
2176 return snprintf(buf, PAGE_SIZE, "%llu\n", 2257 return snprintf(buf, PAGE_SIZE, "%llu\n",
2177 sbi->s_kbytes_written + 2258 (unsigned long long)(sbi->s_kbytes_written +
2178 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - 2259 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
2179 EXT4_SB(sb)->s_sectors_written_start) >> 1)); 2260 EXT4_SB(sb)->s_sectors_written_start) >> 1)));
2180} 2261}
2181 2262
2182static ssize_t inode_readahead_blks_store(struct ext4_attr *a, 2263static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
@@ -2291,7 +2372,7 @@ static void ext4_sb_release(struct kobject *kobj)
2291} 2372}
2292 2373
2293 2374
2294static struct sysfs_ops ext4_attr_ops = { 2375static const struct sysfs_ops ext4_attr_ops = {
2295 .show = ext4_attr_show, 2376 .show = ext4_attr_show,
2296 .store = ext4_attr_store, 2377 .store = ext4_attr_store,
2297}; 2378};
@@ -2431,8 +2512,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2431 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 2512 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
2432 if (def_mount_opts & EXT4_DEFM_DEBUG) 2513 if (def_mount_opts & EXT4_DEFM_DEBUG)
2433 set_opt(sbi->s_mount_opt, DEBUG); 2514 set_opt(sbi->s_mount_opt, DEBUG);
2434 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) 2515 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
2516 ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
2517 "2.6.38");
2435 set_opt(sbi->s_mount_opt, GRPID); 2518 set_opt(sbi->s_mount_opt, GRPID);
2519 }
2436 if (def_mount_opts & EXT4_DEFM_UID16) 2520 if (def_mount_opts & EXT4_DEFM_UID16)
2437 set_opt(sbi->s_mount_opt, NO_UID32); 2521 set_opt(sbi->s_mount_opt, NO_UID32);
2438#ifdef CONFIG_EXT4_FS_XATTR 2522#ifdef CONFIG_EXT4_FS_XATTR
@@ -2444,11 +2528,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2444 set_opt(sbi->s_mount_opt, POSIX_ACL); 2528 set_opt(sbi->s_mount_opt, POSIX_ACL);
2445#endif 2529#endif
2446 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) 2530 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
2447 sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; 2531 set_opt(sbi->s_mount_opt, JOURNAL_DATA);
2448 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) 2532 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
2449 sbi->s_mount_opt |= EXT4_MOUNT_ORDERED_DATA; 2533 set_opt(sbi->s_mount_opt, ORDERED_DATA);
2450 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) 2534 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
2451 sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA; 2535 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
2452 2536
2453 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) 2537 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
2454 set_opt(sbi->s_mount_opt, ERRORS_PANIC); 2538 set_opt(sbi->s_mount_opt, ERRORS_PANIC);
@@ -2469,14 +2553,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2469 * enable delayed allocation by default 2553 * enable delayed allocation by default
2470 * Use -o nodelalloc to turn it off 2554 * Use -o nodelalloc to turn it off
2471 */ 2555 */
2472 set_opt(sbi->s_mount_opt, DELALLOC); 2556 if (!IS_EXT3_SB(sb))
2557 set_opt(sbi->s_mount_opt, DELALLOC);
2473 2558
2474 if (!parse_options((char *) data, sb, &journal_devnum, 2559 if (!parse_options((char *) data, sb, &journal_devnum,
2475 &journal_ioprio, NULL, 0)) 2560 &journal_ioprio, NULL, 0))
2476 goto failed_mount; 2561 goto failed_mount;
2477 2562
2478 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 2563 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
2479 ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 2564 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
2480 2565
2481 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && 2566 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
2482 (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || 2567 (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
@@ -2765,7 +2850,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2765 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { 2850 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
2766 ext4_msg(sb, KERN_ERR, "required journal recovery " 2851 ext4_msg(sb, KERN_ERR, "required journal recovery "
2767 "suppressed and not mounted read-only"); 2852 "suppressed and not mounted read-only");
2768 goto failed_mount4; 2853 goto failed_mount_wq;
2769 } else { 2854 } else {
2770 clear_opt(sbi->s_mount_opt, DATA_FLAGS); 2855 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
2771 set_opt(sbi->s_mount_opt, WRITEBACK_DATA); 2856 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
@@ -2778,7 +2863,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2778 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, 2863 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
2779 JBD2_FEATURE_INCOMPAT_64BIT)) { 2864 JBD2_FEATURE_INCOMPAT_64BIT)) {
2780 ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); 2865 ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
2781 goto failed_mount4; 2866 goto failed_mount_wq;
2782 } 2867 }
2783 2868
2784 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { 2869 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
@@ -2817,7 +2902,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2817 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { 2902 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
2818 ext4_msg(sb, KERN_ERR, "Journal does not support " 2903 ext4_msg(sb, KERN_ERR, "Journal does not support "
2819 "requested data journaling mode"); 2904 "requested data journaling mode");
2820 goto failed_mount4; 2905 goto failed_mount_wq;
2821 } 2906 }
2822 default: 2907 default:
2823 break; 2908 break;
@@ -2825,13 +2910,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2825 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 2910 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
2826 2911
2827no_journal: 2912no_journal:
2828
2829 if (test_opt(sb, NOBH)) { 2913 if (test_opt(sb, NOBH)) {
2830 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { 2914 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
2831 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " 2915 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
2832 "its supported only with writeback mode"); 2916 "its supported only with writeback mode");
2833 clear_opt(sbi->s_mount_opt, NOBH); 2917 clear_opt(sbi->s_mount_opt, NOBH);
2834 } 2918 }
2919 if (test_opt(sb, DIOREAD_NOLOCK)) {
2920 ext4_msg(sb, KERN_WARNING, "dioread_nolock option is "
2921 "not supported with nobh mode");
2922 goto failed_mount_wq;
2923 }
2835 } 2924 }
2836 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); 2925 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
2837 if (!EXT4_SB(sb)->dio_unwritten_wq) { 2926 if (!EXT4_SB(sb)->dio_unwritten_wq) {
@@ -2896,6 +2985,18 @@ no_journal:
2896 "requested data journaling mode"); 2985 "requested data journaling mode");
2897 clear_opt(sbi->s_mount_opt, DELALLOC); 2986 clear_opt(sbi->s_mount_opt, DELALLOC);
2898 } 2987 }
2988 if (test_opt(sb, DIOREAD_NOLOCK)) {
2989 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
2990 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
2991 "option - requested data journaling mode");
2992 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
2993 }
2994 if (sb->s_blocksize < PAGE_SIZE) {
2995 ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
2996 "option - block size is too small");
2997 clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
2998 }
2999 }
2899 3000
2900 err = ext4_setup_system_zone(sb); 3001 err = ext4_setup_system_zone(sb);
2901 if (err) { 3002 if (err) {
@@ -3359,10 +3460,9 @@ static void ext4_clear_journal_err(struct super_block *sb,
3359 char nbuf[16]; 3460 char nbuf[16];
3360 3461
3361 errstr = ext4_decode_error(sb, j_errno, nbuf); 3462 errstr = ext4_decode_error(sb, j_errno, nbuf);
3362 ext4_warning(sb, __func__, "Filesystem error recorded " 3463 ext4_warning(sb, "Filesystem error recorded "
3363 "from previous mount: %s", errstr); 3464 "from previous mount: %s", errstr);
3364 ext4_warning(sb, __func__, "Marking fs in need of " 3465 ext4_warning(sb, "Marking fs in need of filesystem check.");
3365 "filesystem check.");
3366 3466
3367 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 3467 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
3368 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 3468 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
@@ -3513,7 +3613,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3513 ext4_abort(sb, __func__, "Abort forced by user"); 3613 ext4_abort(sb, __func__, "Abort forced by user");
3514 3614
3515 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 3615 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
3516 ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 3616 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
3517 3617
3518 es = sbi->s_es; 3618 es = sbi->s_es;
3519 3619
@@ -3707,7 +3807,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
3707 * Process 1 Process 2 3807 * Process 1 Process 2
3708 * ext4_create() quota_sync() 3808 * ext4_create() quota_sync()
3709 * jbd2_journal_start() write_dquot() 3809 * jbd2_journal_start() write_dquot()
3710 * vfs_dq_init() down(dqio_mutex) 3810 * dquot_initialize() down(dqio_mutex)
3711 * down(dqio_mutex) jbd2_journal_start() 3811 * down(dqio_mutex) jbd2_journal_start()
3712 * 3812 *
3713 */ 3813 */
@@ -3916,9 +4016,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3916 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); 4016 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
3917 int err = 0; 4017 int err = 0;
3918 int offset = off & (sb->s_blocksize - 1); 4018 int offset = off & (sb->s_blocksize - 1);
3919 int tocopy;
3920 int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL; 4019 int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
3921 size_t towrite = len;
3922 struct buffer_head *bh; 4020 struct buffer_head *bh;
3923 handle_t *handle = journal_current_handle(); 4021 handle_t *handle = journal_current_handle();
3924 4022
@@ -3928,52 +4026,53 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3928 (unsigned long long)off, (unsigned long long)len); 4026 (unsigned long long)off, (unsigned long long)len);
3929 return -EIO; 4027 return -EIO;
3930 } 4028 }
4029 /*
4030 * Since we account only one data block in transaction credits,
4031 * then it is impossible to cross a block boundary.
4032 */
4033 if (sb->s_blocksize - offset < len) {
4034 ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
4035 " cancelled because not block aligned",
4036 (unsigned long long)off, (unsigned long long)len);
4037 return -EIO;
4038 }
4039
3931 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); 4040 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
3932 while (towrite > 0) { 4041 bh = ext4_bread(handle, inode, blk, 1, &err);
3933 tocopy = sb->s_blocksize - offset < towrite ? 4042 if (!bh)
3934 sb->s_blocksize - offset : towrite; 4043 goto out;
3935 bh = ext4_bread(handle, inode, blk, 1, &err); 4044 if (journal_quota) {
3936 if (!bh) 4045 err = ext4_journal_get_write_access(handle, bh);
4046 if (err) {
4047 brelse(bh);
3937 goto out; 4048 goto out;
3938 if (journal_quota) {
3939 err = ext4_journal_get_write_access(handle, bh);
3940 if (err) {
3941 brelse(bh);
3942 goto out;
3943 }
3944 }
3945 lock_buffer(bh);
3946 memcpy(bh->b_data+offset, data, tocopy);
3947 flush_dcache_page(bh->b_page);
3948 unlock_buffer(bh);
3949 if (journal_quota)
3950 err = ext4_handle_dirty_metadata(handle, NULL, bh);
3951 else {
3952 /* Always do at least ordered writes for quotas */
3953 err = ext4_jbd2_file_inode(handle, inode);
3954 mark_buffer_dirty(bh);
3955 } 4049 }
3956 brelse(bh);
3957 if (err)
3958 goto out;
3959 offset = 0;
3960 towrite -= tocopy;
3961 data += tocopy;
3962 blk++;
3963 } 4050 }
4051 lock_buffer(bh);
4052 memcpy(bh->b_data+offset, data, len);
4053 flush_dcache_page(bh->b_page);
4054 unlock_buffer(bh);
4055 if (journal_quota)
4056 err = ext4_handle_dirty_metadata(handle, NULL, bh);
4057 else {
4058 /* Always do at least ordered writes for quotas */
4059 err = ext4_jbd2_file_inode(handle, inode);
4060 mark_buffer_dirty(bh);
4061 }
4062 brelse(bh);
3964out: 4063out:
3965 if (len == towrite) { 4064 if (err) {
3966 mutex_unlock(&inode->i_mutex); 4065 mutex_unlock(&inode->i_mutex);
3967 return err; 4066 return err;
3968 } 4067 }
3969 if (inode->i_size < off+len-towrite) { 4068 if (inode->i_size < off + len) {
3970 i_size_write(inode, off+len-towrite); 4069 i_size_write(inode, off + len);
3971 EXT4_I(inode)->i_disksize = inode->i_size; 4070 EXT4_I(inode)->i_disksize = inode->i_size;
3972 } 4071 }
3973 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 4072 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
3974 ext4_mark_inode_dirty(handle, inode); 4073 ext4_mark_inode_dirty(handle, inode);
3975 mutex_unlock(&inode->i_mutex); 4074 mutex_unlock(&inode->i_mutex);
3976 return len - towrite; 4075 return len;
3977} 4076}
3978 4077
3979#endif 4078#endif
@@ -3984,7 +4083,7 @@ static int ext4_get_sb(struct file_system_type *fs_type, int flags,
3984 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); 4083 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
3985} 4084}
3986 4085
3987#if !defined(CONTIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4086#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
3988static struct file_system_type ext2_fs_type = { 4087static struct file_system_type ext2_fs_type = {
3989 .owner = THIS_MODULE, 4088 .owner = THIS_MODULE,
3990 .name = "ext2", 4089 .name = "ext2",
@@ -4005,20 +4104,13 @@ static inline void unregister_as_ext2(void)
4005{ 4104{
4006 unregister_filesystem(&ext2_fs_type); 4105 unregister_filesystem(&ext2_fs_type);
4007} 4106}
4107MODULE_ALIAS("ext2");
4008#else 4108#else
4009static inline void register_as_ext2(void) { } 4109static inline void register_as_ext2(void) { }
4010static inline void unregister_as_ext2(void) { } 4110static inline void unregister_as_ext2(void) { }
4011#endif 4111#endif
4012 4112
4013#if !defined(CONTIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4113#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4014static struct file_system_type ext3_fs_type = {
4015 .owner = THIS_MODULE,
4016 .name = "ext3",
4017 .get_sb = ext4_get_sb,
4018 .kill_sb = kill_block_super,
4019 .fs_flags = FS_REQUIRES_DEV,
4020};
4021
4022static inline void register_as_ext3(void) 4114static inline void register_as_ext3(void)
4023{ 4115{
4024 int err = register_filesystem(&ext3_fs_type); 4116 int err = register_filesystem(&ext3_fs_type);
@@ -4031,6 +4123,7 @@ static inline void unregister_as_ext3(void)
4031{ 4123{
4032 unregister_filesystem(&ext3_fs_type); 4124 unregister_filesystem(&ext3_fs_type);
4033} 4125}
4126MODULE_ALIAS("ext3");
4034#else 4127#else
4035static inline void register_as_ext3(void) { } 4128static inline void register_as_ext3(void) { }
4036static inline void unregister_as_ext3(void) { } 4129static inline void unregister_as_ext3(void) { }
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 83218bebbc7c..b4c5aa8489d8 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -227,7 +227,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
227 ea_bdebug(bh, "b_count=%d, refcount=%d", 227 ea_bdebug(bh, "b_count=%d, refcount=%d",
228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
229 if (ext4_xattr_check_block(bh)) { 229 if (ext4_xattr_check_block(bh)) {
230bad_block: ext4_error(inode->i_sb, __func__, 230bad_block:
231 ext4_error(inode->i_sb,
231 "inode %lu: bad block %llu", inode->i_ino, 232 "inode %lu: bad block %llu", inode->i_ino,
232 EXT4_I(inode)->i_file_acl); 233 EXT4_I(inode)->i_file_acl);
233 error = -EIO; 234 error = -EIO;
@@ -267,7 +268,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
267 void *end; 268 void *end;
268 int error; 269 int error;
269 270
270 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)) 271 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
271 return -ENODATA; 272 return -ENODATA;
272 error = ext4_get_inode_loc(inode, &iloc); 273 error = ext4_get_inode_loc(inode, &iloc);
273 if (error) 274 if (error)
@@ -371,7 +372,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
371 ea_bdebug(bh, "b_count=%d, refcount=%d", 372 ea_bdebug(bh, "b_count=%d, refcount=%d",
372 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 373 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
373 if (ext4_xattr_check_block(bh)) { 374 if (ext4_xattr_check_block(bh)) {
374 ext4_error(inode->i_sb, __func__, 375 ext4_error(inode->i_sb,
375 "inode %lu: bad block %llu", inode->i_ino, 376 "inode %lu: bad block %llu", inode->i_ino,
376 EXT4_I(inode)->i_file_acl); 377 EXT4_I(inode)->i_file_acl);
377 error = -EIO; 378 error = -EIO;
@@ -396,7 +397,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
396 void *end; 397 void *end;
397 int error; 398 int error;
398 399
399 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)) 400 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
400 return 0; 401 return 0;
401 error = ext4_get_inode_loc(inode, &iloc); 402 error = ext4_get_inode_loc(inode, &iloc);
402 if (error) 403 if (error)
@@ -494,7 +495,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
494 error = ext4_handle_dirty_metadata(handle, inode, bh); 495 error = ext4_handle_dirty_metadata(handle, inode, bh);
495 if (IS_SYNC(inode)) 496 if (IS_SYNC(inode))
496 ext4_handle_sync(handle); 497 ext4_handle_sync(handle);
497 vfs_dq_free_block(inode, 1); 498 dquot_free_block(inode, 1);
498 ea_bdebug(bh, "refcount now=%d; releasing", 499 ea_bdebug(bh, "refcount now=%d; releasing",
499 le32_to_cpu(BHDR(bh)->h_refcount)); 500 le32_to_cpu(BHDR(bh)->h_refcount));
500 if (ce) 501 if (ce)
@@ -665,9 +666,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
665 atomic_read(&(bs->bh->b_count)), 666 atomic_read(&(bs->bh->b_count)),
666 le32_to_cpu(BHDR(bs->bh)->h_refcount)); 667 le32_to_cpu(BHDR(bs->bh)->h_refcount));
667 if (ext4_xattr_check_block(bs->bh)) { 668 if (ext4_xattr_check_block(bs->bh)) {
668 ext4_error(sb, __func__, 669 ext4_error(sb, "inode %lu: bad block %llu",
669 "inode %lu: bad block %llu", inode->i_ino, 670 inode->i_ino, EXT4_I(inode)->i_file_acl);
670 EXT4_I(inode)->i_file_acl);
671 error = -EIO; 671 error = -EIO;
672 goto cleanup; 672 goto cleanup;
673 } 673 }
@@ -787,8 +787,8 @@ inserted:
787 else { 787 else {
788 /* The old block is released after updating 788 /* The old block is released after updating
789 the inode. */ 789 the inode. */
790 error = -EDQUOT; 790 error = dquot_alloc_block(inode, 1);
791 if (vfs_dq_alloc_block(inode, 1)) 791 if (error)
792 goto cleanup; 792 goto cleanup;
793 error = ext4_journal_get_write_access(handle, 793 error = ext4_journal_get_write_access(handle,
794 new_bh); 794 new_bh);
@@ -876,13 +876,12 @@ cleanup:
876 return error; 876 return error;
877 877
878cleanup_dquot: 878cleanup_dquot:
879 vfs_dq_free_block(inode, 1); 879 dquot_free_block(inode, 1);
880 goto cleanup; 880 goto cleanup;
881 881
882bad_block: 882bad_block:
883 ext4_error(inode->i_sb, __func__, 883 ext4_error(inode->i_sb, "inode %lu: bad block %llu",
884 "inode %lu: bad block %llu", inode->i_ino, 884 inode->i_ino, EXT4_I(inode)->i_file_acl);
885 EXT4_I(inode)->i_file_acl);
886 goto cleanup; 885 goto cleanup;
887 886
888#undef header 887#undef header
@@ -908,7 +907,7 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
908 is->s.base = is->s.first = IFIRST(header); 907 is->s.base = is->s.first = IFIRST(header);
909 is->s.here = is->s.first; 908 is->s.here = is->s.first;
910 is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; 909 is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
911 if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { 910 if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
912 error = ext4_xattr_check_names(IFIRST(header), is->s.end); 911 error = ext4_xattr_check_names(IFIRST(header), is->s.end);
913 if (error) 912 if (error)
914 return error; 913 return error;
@@ -940,10 +939,10 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
940 header = IHDR(inode, ext4_raw_inode(&is->iloc)); 939 header = IHDR(inode, ext4_raw_inode(&is->iloc));
941 if (!IS_LAST_ENTRY(s->first)) { 940 if (!IS_LAST_ENTRY(s->first)) {
942 header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); 941 header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
943 EXT4_I(inode)->i_state |= EXT4_STATE_XATTR; 942 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
944 } else { 943 } else {
945 header->h_magic = cpu_to_le32(0); 944 header->h_magic = cpu_to_le32(0);
946 EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR; 945 ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
947 } 946 }
948 return 0; 947 return 0;
949} 948}
@@ -986,8 +985,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
986 if (strlen(name) > 255) 985 if (strlen(name) > 255)
987 return -ERANGE; 986 return -ERANGE;
988 down_write(&EXT4_I(inode)->xattr_sem); 987 down_write(&EXT4_I(inode)->xattr_sem);
989 no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND; 988 no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
990 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; 989 ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
991 990
992 error = ext4_get_inode_loc(inode, &is.iloc); 991 error = ext4_get_inode_loc(inode, &is.iloc);
993 if (error) 992 if (error)
@@ -997,10 +996,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
997 if (error) 996 if (error)
998 goto cleanup; 997 goto cleanup;
999 998
1000 if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) { 999 if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) {
1001 struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); 1000 struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
1002 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 1001 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
1003 EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW; 1002 ext4_clear_inode_state(inode, EXT4_STATE_NEW);
1004 } 1003 }
1005 1004
1006 error = ext4_xattr_ibody_find(inode, &i, &is); 1005 error = ext4_xattr_ibody_find(inode, &i, &is);
@@ -1052,7 +1051,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1052 ext4_xattr_update_super_block(handle, inode->i_sb); 1051 ext4_xattr_update_super_block(handle, inode->i_sb);
1053 inode->i_ctime = ext4_current_time(inode); 1052 inode->i_ctime = ext4_current_time(inode);
1054 if (!value) 1053 if (!value)
1055 EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; 1054 ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
1056 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); 1055 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
1057 /* 1056 /*
1058 * The bh is consumed by ext4_mark_iloc_dirty, even with 1057 * The bh is consumed by ext4_mark_iloc_dirty, even with
@@ -1067,7 +1066,7 @@ cleanup:
1067 brelse(is.iloc.bh); 1066 brelse(is.iloc.bh);
1068 brelse(bs.bh); 1067 brelse(bs.bh);
1069 if (no_expand == 0) 1068 if (no_expand == 0)
1070 EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; 1069 ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
1071 up_write(&EXT4_I(inode)->xattr_sem); 1070 up_write(&EXT4_I(inode)->xattr_sem);
1072 return error; 1071 return error;
1073} 1072}
@@ -1195,9 +1194,8 @@ retry:
1195 if (!bh) 1194 if (!bh)
1196 goto cleanup; 1195 goto cleanup;
1197 if (ext4_xattr_check_block(bh)) { 1196 if (ext4_xattr_check_block(bh)) {
1198 ext4_error(inode->i_sb, __func__, 1197 ext4_error(inode->i_sb, "inode %lu: bad block %llu",
1199 "inode %lu: bad block %llu", inode->i_ino, 1198 inode->i_ino, EXT4_I(inode)->i_file_acl);
1200 EXT4_I(inode)->i_file_acl);
1201 error = -EIO; 1199 error = -EIO;
1202 goto cleanup; 1200 goto cleanup;
1203 } 1201 }
@@ -1302,6 +1300,8 @@ retry:
1302 1300
1303 /* Remove the chosen entry from the inode */ 1301 /* Remove the chosen entry from the inode */
1304 error = ext4_xattr_ibody_set(handle, inode, &i, is); 1302 error = ext4_xattr_ibody_set(handle, inode, &i, is);
1303 if (error)
1304 goto cleanup;
1305 1305
1306 entry = IFIRST(header); 1306 entry = IFIRST(header);
1307 if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize) 1307 if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize)
@@ -1332,6 +1332,8 @@ retry:
1332 goto cleanup; 1332 goto cleanup;
1333 kfree(b_entry_name); 1333 kfree(b_entry_name);
1334 kfree(buffer); 1334 kfree(buffer);
1335 b_entry_name = NULL;
1336 buffer = NULL;
1335 brelse(is->iloc.bh); 1337 brelse(is->iloc.bh);
1336 kfree(is); 1338 kfree(is);
1337 kfree(bs); 1339 kfree(bs);
@@ -1370,16 +1372,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
1370 goto cleanup; 1372 goto cleanup;
1371 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); 1373 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
1372 if (!bh) { 1374 if (!bh) {
1373 ext4_error(inode->i_sb, __func__, 1375 ext4_error(inode->i_sb, "inode %lu: block %llu read error",
1374 "inode %lu: block %llu read error", inode->i_ino, 1376 inode->i_ino, EXT4_I(inode)->i_file_acl);
1375 EXT4_I(inode)->i_file_acl);
1376 goto cleanup; 1377 goto cleanup;
1377 } 1378 }
1378 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || 1379 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
1379 BHDR(bh)->h_blocks != cpu_to_le32(1)) { 1380 BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1380 ext4_error(inode->i_sb, __func__, 1381 ext4_error(inode->i_sb, "inode %lu: bad block %llu",
1381 "inode %lu: bad block %llu", inode->i_ino, 1382 inode->i_ino, EXT4_I(inode)->i_file_acl);
1382 EXT4_I(inode)->i_file_acl);
1383 goto cleanup; 1383 goto cleanup;
1384 } 1384 }
1385 ext4_xattr_release_block(handle, inode, bh); 1385 ext4_xattr_release_block(handle, inode, bh);
@@ -1504,7 +1504,7 @@ again:
1504 } 1504 }
1505 bh = sb_bread(inode->i_sb, ce->e_block); 1505 bh = sb_bread(inode->i_sb, ce->e_block);
1506 if (!bh) { 1506 if (!bh) {
1507 ext4_error(inode->i_sb, __func__, 1507 ext4_error(inode->i_sb,
1508 "inode %lu: block %lu read error", 1508 "inode %lu: block %lu read error",
1509 inode->i_ino, (unsigned long) ce->e_block); 1509 inode->i_ino, (unsigned long) ce->e_block);
1510 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= 1510 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 983c253999a7..8b145e98df07 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -7,6 +7,7 @@
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/security.h> 9#include <linux/security.h>
10#include <linux/slab.h>
10#include "ext4_jbd2.h" 11#include "ext4_jbd2.h"
11#include "ext4.h" 12#include "ext4.h"
12#include "xattr.h" 13#include "xattr.h"
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 923990e4f16e..113f0a1e565d 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/slab.h>
12#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
13#include "fat.h" 14#include "fat.h"
14 15
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 14da530b05ca..0ce143bd7d56 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -558,7 +558,7 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
558 buf->f_bavail = sbi->free_clusters; 558 buf->f_bavail = sbi->free_clusters;
559 buf->f_fsid.val[0] = (u32)id; 559 buf->f_fsid.val[0] = (u32)id;
560 buf->f_fsid.val[1] = (u32)(id >> 32); 560 buf->f_fsid.val[1] = (u32)(id >> 32);
561 buf->f_namelen = sbi->options.isvfat ? 260 : 12; 561 buf->f_namelen = sbi->options.isvfat ? FAT_LFN_LEN : 12;
562 562
563 return 0; 563 return 0;
564} 564}
@@ -577,7 +577,7 @@ static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi,
577 return i_pos; 577 return i_pos;
578} 578}
579 579
580static int fat_write_inode(struct inode *inode, int wait) 580static int __fat_write_inode(struct inode *inode, int wait)
581{ 581{
582 struct super_block *sb = inode->i_sb; 582 struct super_block *sb = inode->i_sb;
583 struct msdos_sb_info *sbi = MSDOS_SB(sb); 583 struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -634,9 +634,14 @@ retry:
634 return err; 634 return err;
635} 635}
636 636
637static int fat_write_inode(struct inode *inode, struct writeback_control *wbc)
638{
639 return __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
640}
641
637int fat_sync_inode(struct inode *inode) 642int fat_sync_inode(struct inode *inode)
638{ 643{
639 return fat_write_inode(inode, 1); 644 return __fat_write_inode(inode, 1);
640} 645}
641 646
642EXPORT_SYMBOL_GPL(fat_sync_inode); 647EXPORT_SYMBOL_GPL(fat_sync_inode);
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index f565f24019b5..6fcc7e71fbaa 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -309,7 +309,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
309{ 309{
310 struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options; 310 struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options;
311 wchar_t *ip, *ext_start, *end, *name_start; 311 wchar_t *ip, *ext_start, *end, *name_start;
312 unsigned char base[9], ext[4], buf[8], *p; 312 unsigned char base[9], ext[4], buf[5], *p;
313 unsigned char charbuf[NLS_MAX_CHARSET_SIZE]; 313 unsigned char charbuf[NLS_MAX_CHARSET_SIZE];
314 int chl, chi; 314 int chl, chi;
315 int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen; 315 int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen;
@@ -467,7 +467,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
467 return 0; 467 return 0;
468 } 468 }
469 469
470 i = jiffies & 0xffff; 470 i = jiffies;
471 sz = (jiffies >> 16) & 0x7; 471 sz = (jiffies >> 16) & 0x7;
472 if (baselen > 2) { 472 if (baselen > 2) {
473 baselen = numtail2_baselen; 473 baselen = numtail2_baselen;
@@ -476,7 +476,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
476 name_res[baselen + 4] = '~'; 476 name_res[baselen + 4] = '~';
477 name_res[baselen + 5] = '1' + sz; 477 name_res[baselen + 5] = '1' + sz;
478 while (1) { 478 while (1) {
479 sprintf(buf, "%04X", i); 479 snprintf(buf, sizeof(buf), "%04X", i & 0xffff);
480 memcpy(&name_res[baselen], buf, 4); 480 memcpy(&name_res[baselen], buf, 4);
481 if (vfat_find_form(dir, name_res) < 0) 481 if (vfat_find_form(dir, name_res) < 0)
482 break; 482 break;
@@ -502,14 +502,14 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
502 *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname); 502 *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname);
503 if (*outlen < 0) 503 if (*outlen < 0)
504 return *outlen; 504 return *outlen;
505 else if (*outlen > 255) 505 else if (*outlen > FAT_LFN_LEN)
506 return -ENAMETOOLONG; 506 return -ENAMETOOLONG;
507 507
508 op = &outname[*outlen * sizeof(wchar_t)]; 508 op = &outname[*outlen * sizeof(wchar_t)];
509 } else { 509 } else {
510 if (nls) { 510 if (nls) {
511 for (i = 0, ip = name, op = outname, *outlen = 0; 511 for (i = 0, ip = name, op = outname, *outlen = 0;
512 i < len && *outlen <= 255; 512 i < len && *outlen <= FAT_LFN_LEN;
513 *outlen += 1) 513 *outlen += 1)
514 { 514 {
515 if (escape && (*ip == ':')) { 515 if (escape && (*ip == ':')) {
@@ -549,7 +549,7 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
549 return -ENAMETOOLONG; 549 return -ENAMETOOLONG;
550 } else { 550 } else {
551 for (i = 0, ip = name, op = outname, *outlen = 0; 551 for (i = 0, ip = name, op = outname, *outlen = 0;
552 i < len && *outlen <= 255; 552 i < len && *outlen <= FAT_LFN_LEN;
553 i++, *outlen += 1) 553 i++, *outlen += 1)
554 { 554 {
555 *op++ = *ip++; 555 *op++ = *ip++;
@@ -701,6 +701,15 @@ static int vfat_find(struct inode *dir, struct qstr *qname,
701 return fat_search_long(dir, qname->name, len, sinfo); 701 return fat_search_long(dir, qname->name, len, sinfo);
702} 702}
703 703
704/*
705 * (nfsd's) anonymous disconnected dentry?
706 * NOTE: !IS_ROOT() is not anonymous (I.e. d_splice_alias() did the job).
707 */
708static int vfat_d_anon_disconn(struct dentry *dentry)
709{
710 return IS_ROOT(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED);
711}
712
704static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, 713static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
705 struct nameidata *nd) 714 struct nameidata *nd)
706{ 715{
@@ -729,11 +738,11 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
729 } 738 }
730 739
731 alias = d_find_alias(inode); 740 alias = d_find_alias(inode);
732 if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) { 741 if (alias && !vfat_d_anon_disconn(alias)) {
733 /* 742 /*
734 * This inode has non DCACHE_DISCONNECTED dentry. This 743 * This inode has non anonymous-DCACHE_DISCONNECTED
735 * means, the user did ->lookup() by an another name 744 * dentry. This means, the user did ->lookup() by an
736 * (longname vs 8.3 alias of it) in past. 745 * another name (longname vs 8.3 alias of it) in past.
737 * 746 *
738 * Switch to new one for reason of locality if possible. 747 * Switch to new one for reason of locality if possible.
739 */ 748 */
@@ -743,7 +752,9 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
743 iput(inode); 752 iput(inode);
744 unlock_super(sb); 753 unlock_super(sb);
745 return alias; 754 return alias;
746 } 755 } else
756 dput(alias);
757
747out: 758out:
748 unlock_super(sb); 759 unlock_super(sb);
749 dentry->d_op = sb->s_root->d_op; 760 dentry->d_op = sb->s_root->d_op;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 2cf93ec40a67..452d02f9075e 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -344,7 +344,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
344 switch (cmd) { 344 switch (cmd) {
345 case F_DUPFD: 345 case F_DUPFD:
346 case F_DUPFD_CLOEXEC: 346 case F_DUPFD_CLOEXEC:
347 if (arg >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) 347 if (arg >= rlimit(RLIMIT_NOFILE))
348 break; 348 break;
349 err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? O_CLOEXEC : 0); 349 err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? O_CLOEXEC : 0);
350 if (err >= 0) { 350 if (err >= 0) {
@@ -618,60 +618,90 @@ static DEFINE_RWLOCK(fasync_lock);
618static struct kmem_cache *fasync_cache __read_mostly; 618static struct kmem_cache *fasync_cache __read_mostly;
619 619
620/* 620/*
621 * fasync_helper() is used by almost all character device drivers 621 * Remove a fasync entry. If successfully removed, return
622 * to set up the fasync queue. It returns negative on error, 0 if it did 622 * positive and clear the FASYNC flag. If no entry exists,
623 * no changes and positive if it added/deleted the entry. 623 * do nothing and return 0.
624 *
625 * NOTE! It is very important that the FASYNC flag always
626 * match the state "is the filp on a fasync list".
627 *
628 * We always take the 'filp->f_lock', in since fasync_lock
629 * needs to be irq-safe.
624 */ 630 */
625int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp) 631static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
626{ 632{
627 struct fasync_struct *fa, **fp; 633 struct fasync_struct *fa, **fp;
628 struct fasync_struct *new = NULL;
629 int result = 0; 634 int result = 0;
630 635
631 if (on) { 636 spin_lock(&filp->f_lock);
632 new = kmem_cache_alloc(fasync_cache, GFP_KERNEL); 637 write_lock_irq(&fasync_lock);
633 if (!new) 638 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
634 return -ENOMEM; 639 if (fa->fa_file != filp)
640 continue;
641 *fp = fa->fa_next;
642 kmem_cache_free(fasync_cache, fa);
643 filp->f_flags &= ~FASYNC;
644 result = 1;
645 break;
635 } 646 }
647 write_unlock_irq(&fasync_lock);
648 spin_unlock(&filp->f_lock);
649 return result;
650}
651
652/*
653 * Add a fasync entry. Return negative on error, positive if
654 * added, and zero if did nothing but change an existing one.
655 *
656 * NOTE! It is very important that the FASYNC flag always
657 * match the state "is the filp on a fasync list".
658 */
659static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
660{
661 struct fasync_struct *new, *fa, **fp;
662 int result = 0;
663
664 new = kmem_cache_alloc(fasync_cache, GFP_KERNEL);
665 if (!new)
666 return -ENOMEM;
636 667
637 /*
638 * We need to take f_lock first since it's not an IRQ-safe
639 * lock.
640 */
641 spin_lock(&filp->f_lock); 668 spin_lock(&filp->f_lock);
642 write_lock_irq(&fasync_lock); 669 write_lock_irq(&fasync_lock);
643 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { 670 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
644 if (fa->fa_file == filp) { 671 if (fa->fa_file != filp)
645 if(on) { 672 continue;
646 fa->fa_fd = fd; 673 fa->fa_fd = fd;
647 kmem_cache_free(fasync_cache, new); 674 kmem_cache_free(fasync_cache, new);
648 } else { 675 goto out;
649 *fp = fa->fa_next;
650 kmem_cache_free(fasync_cache, fa);
651 result = 1;
652 }
653 goto out;
654 }
655 } 676 }
656 677
657 if (on) { 678 new->magic = FASYNC_MAGIC;
658 new->magic = FASYNC_MAGIC; 679 new->fa_file = filp;
659 new->fa_file = filp; 680 new->fa_fd = fd;
660 new->fa_fd = fd; 681 new->fa_next = *fapp;
661 new->fa_next = *fapp; 682 *fapp = new;
662 *fapp = new; 683 result = 1;
663 result = 1; 684 filp->f_flags |= FASYNC;
664 } 685
665out: 686out:
666 if (on)
667 filp->f_flags |= FASYNC;
668 else
669 filp->f_flags &= ~FASYNC;
670 write_unlock_irq(&fasync_lock); 687 write_unlock_irq(&fasync_lock);
671 spin_unlock(&filp->f_lock); 688 spin_unlock(&filp->f_lock);
672 return result; 689 return result;
673} 690}
674 691
692/*
693 * fasync_helper() is used by almost all character device drivers
694 * to set up the fasync queue, and for regular files by the file
695 * lease code. It returns negative on error, 0 if it did no changes
696 * and positive if it added/deleted the entry.
697 */
698int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp)
699{
700 if (!on)
701 return fasync_remove_entry(filp, fapp);
702 return fasync_add_entry(fd, filp, fapp);
703}
704
675EXPORT_SYMBOL(fasync_helper); 705EXPORT_SYMBOL(fasync_helper);
676 706
677void __kill_fasync(struct fasync_struct *fa, int sig, int band) 707void __kill_fasync(struct fasync_struct *fa, int sig, int band)
diff --git a/fs/fifo.c b/fs/fifo.c
index f8f97b8b6d44..5d6606ffc2d2 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/slab.h>
14#include <linux/fs.h> 13#include <linux/fs.h>
15#include <linux/sched.h> 14#include <linux/sched.h>
16#include <linux/pipe_fs_i.h> 15#include <linux/pipe_fs_i.h>
diff --git a/fs/file.c b/fs/file.c
index 87e129030ab1..34bb7f71d994 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -257,7 +257,7 @@ int expand_files(struct files_struct *files, int nr)
257 * N.B. For clone tasks sharing a files structure, this test 257 * N.B. For clone tasks sharing a files structure, this test
258 * will limit the total number of files that can be opened. 258 * will limit the total number of files that can be opened.
259 */ 259 */
260 if (nr >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) 260 if (nr >= rlimit(RLIMIT_NOFILE))
261 return -EMFILE; 261 return -EMFILE;
262 262
263 /* Do we need to expand? */ 263 /* Do we need to expand? */
@@ -478,7 +478,7 @@ repeat:
478 error = fd; 478 error = fd;
479#if 1 479#if 1
480 /* Sanity check */ 480 /* Sanity check */
481 if (rcu_dereference(fdt->fd[fd]) != NULL) { 481 if (rcu_dereference_raw(fdt->fd[fd]) != NULL) {
482 printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); 482 printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
483 rcu_assign_pointer(fdt->fd[fd], NULL); 483 rcu_assign_pointer(fdt->fd[fd], NULL);
484 } 484 }
diff --git a/fs/file_table.c b/fs/file_table.c
index 69652c5bd5f0..32d12b78bac8 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -253,6 +253,7 @@ void __fput(struct file *file)
253 if (file->f_op && file->f_op->release) 253 if (file->f_op && file->f_op->release)
254 file->f_op->release(inode, file); 254 file->f_op->release(inode, file);
255 security_file_free(file); 255 security_file_free(file);
256 ima_file_free(file);
256 if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL)) 257 if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
257 cdev_put(inode->i_cdev); 258 cdev_put(inode->i_cdev);
258 fops_put(file->f_op); 259 fops_put(file->f_op);
@@ -392,7 +393,9 @@ retry:
392 continue; 393 continue;
393 if (!(f->f_mode & FMODE_WRITE)) 394 if (!(f->f_mode & FMODE_WRITE))
394 continue; 395 continue;
396 spin_lock(&f->f_lock);
395 f->f_mode &= ~FMODE_WRITE; 397 f->f_mode &= ~FMODE_WRITE;
398 spin_unlock(&f->f_lock);
396 if (file_check_writeable(f) != 0) 399 if (file_check_writeable(f) != 0)
397 continue; 400 continue;
398 file_release_write(f); 401 file_release_write(f);
diff --git a/fs/filesystems.c b/fs/filesystems.c
index a24c58e181db..68ba492d8eef 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -10,10 +10,10 @@
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/slab.h>
14#include <linux/kmod.h> 13#include <linux/kmod.h>
15#include <linux/init.h> 14#include <linux/init.h>
16#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18 18
19/* 19/*
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index ed8f0b0dd880..1429f3ae1e86 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -33,7 +33,6 @@
33#include <linux/fs.h> 33#include <linux/fs.h>
34#include <linux/buffer_head.h> 34#include <linux/buffer_head.h>
35#include <linux/kernel.h> 35#include <linux/kernel.h>
36#include <linux/slab.h>
37#include <linux/pagemap.h> 36#include <linux/pagemap.h>
38 37
39#include "vxfs_extern.h" 38#include "vxfs_extern.h"
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 49bc1b8e8f19..4b37f7cea4dd 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -16,6 +16,7 @@
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/spinlock.h> 18#include <linux/spinlock.h>
19#include <linux/slab.h>
19#include <linux/sched.h> 20#include <linux/sched.h>
20#include <linux/fs.h> 21#include <linux/fs.h>
21#include <linux/mm.h> 22#include <linux/mm.h>
@@ -242,6 +243,7 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
242/** 243/**
243 * bdi_start_writeback - start writeback 244 * bdi_start_writeback - start writeback
244 * @bdi: the backing device to write from 245 * @bdi: the backing device to write from
246 * @sb: write inodes from this super_block
245 * @nr_pages: the number of pages to write 247 * @nr_pages: the number of pages to write
246 * 248 *
247 * Description: 249 * Description:
@@ -380,10 +382,10 @@ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
380 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 382 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
381} 383}
382 384
383static int write_inode(struct inode *inode, int sync) 385static int write_inode(struct inode *inode, struct writeback_control *wbc)
384{ 386{
385 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) 387 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
386 return inode->i_sb->s_op->write_inode(inode, sync); 388 return inode->i_sb->s_op->write_inode(inode, wbc);
387 return 0; 389 return 0;
388} 390}
389 391
@@ -420,7 +422,6 @@ static int
420writeback_single_inode(struct inode *inode, struct writeback_control *wbc) 422writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
421{ 423{
422 struct address_space *mapping = inode->i_mapping; 424 struct address_space *mapping = inode->i_mapping;
423 int wait = wbc->sync_mode == WB_SYNC_ALL;
424 unsigned dirty; 425 unsigned dirty;
425 int ret; 426 int ret;
426 427
@@ -438,7 +439,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
438 * We'll have another go at writing back this inode when we 439 * We'll have another go at writing back this inode when we
439 * completed a full scan of b_io. 440 * completed a full scan of b_io.
440 */ 441 */
441 if (!wait) { 442 if (wbc->sync_mode != WB_SYNC_ALL) {
442 requeue_io(inode); 443 requeue_io(inode);
443 return 0; 444 return 0;
444 } 445 }
@@ -460,15 +461,20 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
460 461
461 ret = do_writepages(mapping, wbc); 462 ret = do_writepages(mapping, wbc);
462 463
463 /* Don't write the inode if only I_DIRTY_PAGES was set */ 464 /*
464 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 465 * Make sure to wait on the data before writing out the metadata.
465 int err = write_inode(inode, wait); 466 * This is important for filesystems that modify metadata on data
467 * I/O completion.
468 */
469 if (wbc->sync_mode == WB_SYNC_ALL) {
470 int err = filemap_fdatawait(mapping);
466 if (ret == 0) 471 if (ret == 0)
467 ret = err; 472 ret = err;
468 } 473 }
469 474
470 if (wait) { 475 /* Don't write the inode if only I_DIRTY_PAGES was set */
471 int err = filemap_fdatawait(mapping); 476 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
477 int err = write_inode(inode, wbc);
472 if (ret == 0) 478 if (ret == 0)
473 ret = err; 479 ret = err;
474 } 480 }
@@ -548,108 +554,85 @@ select_queue:
548 return ret; 554 return ret;
549} 555}
550 556
551static void unpin_sb_for_writeback(struct super_block **psb) 557static void unpin_sb_for_writeback(struct super_block *sb)
552{ 558{
553 struct super_block *sb = *psb; 559 up_read(&sb->s_umount);
554 560 put_super(sb);
555 if (sb) {
556 up_read(&sb->s_umount);
557 put_super(sb);
558 *psb = NULL;
559 }
560} 561}
561 562
563enum sb_pin_state {
564 SB_PINNED,
565 SB_NOT_PINNED,
566 SB_PIN_FAILED
567};
568
562/* 569/*
563 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned 570 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
564 * before calling writeback. So make sure that we do pin it, so it doesn't 571 * before calling writeback. So make sure that we do pin it, so it doesn't
565 * go away while we are writing inodes from it. 572 * go away while we are writing inodes from it.
566 *
567 * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
568 * 1 if we failed.
569 */ 573 */
570static int pin_sb_for_writeback(struct writeback_control *wbc, 574static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
571 struct inode *inode, struct super_block **psb) 575 struct super_block *sb)
572{ 576{
573 struct super_block *sb = inode->i_sb;
574
575 /*
576 * If this sb is already pinned, nothing more to do. If not and
577 * *psb is non-NULL, unpin the old one first
578 */
579 if (sb == *psb)
580 return 0;
581 else if (*psb)
582 unpin_sb_for_writeback(psb);
583
584 /* 577 /*
585 * Caller must already hold the ref for this 578 * Caller must already hold the ref for this
586 */ 579 */
587 if (wbc->sync_mode == WB_SYNC_ALL) { 580 if (wbc->sync_mode == WB_SYNC_ALL) {
588 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 581 WARN_ON(!rwsem_is_locked(&sb->s_umount));
589 return 0; 582 return SB_NOT_PINNED;
590 } 583 }
591
592 spin_lock(&sb_lock); 584 spin_lock(&sb_lock);
593 sb->s_count++; 585 sb->s_count++;
594 if (down_read_trylock(&sb->s_umount)) { 586 if (down_read_trylock(&sb->s_umount)) {
595 if (sb->s_root) { 587 if (sb->s_root) {
596 spin_unlock(&sb_lock); 588 spin_unlock(&sb_lock);
597 goto pinned; 589 return SB_PINNED;
598 } 590 }
599 /* 591 /*
600 * umounted, drop rwsem again and fall through to failure 592 * umounted, drop rwsem again and fall through to failure
601 */ 593 */
602 up_read(&sb->s_umount); 594 up_read(&sb->s_umount);
603 } 595 }
604
605 sb->s_count--; 596 sb->s_count--;
606 spin_unlock(&sb_lock); 597 spin_unlock(&sb_lock);
607 return 1; 598 return SB_PIN_FAILED;
608pinned:
609 *psb = sb;
610 return 0;
611} 599}
612 600
613static void writeback_inodes_wb(struct bdi_writeback *wb, 601/*
614 struct writeback_control *wbc) 602 * Write a portion of b_io inodes which belong to @sb.
603 * If @wbc->sb != NULL, then find and write all such
604 * inodes. Otherwise write only ones which go sequentially
605 * in reverse order.
606 * Return 1, if the caller writeback routine should be
607 * interrupted. Otherwise return 0.
608 */
609static int writeback_sb_inodes(struct super_block *sb,
610 struct bdi_writeback *wb,
611 struct writeback_control *wbc)
615{ 612{
616 struct super_block *sb = wbc->sb, *pin_sb = NULL;
617 const unsigned long start = jiffies; /* livelock avoidance */
618
619 spin_lock(&inode_lock);
620
621 if (!wbc->for_kupdate || list_empty(&wb->b_io))
622 queue_io(wb, wbc->older_than_this);
623
624 while (!list_empty(&wb->b_io)) { 613 while (!list_empty(&wb->b_io)) {
625 struct inode *inode = list_entry(wb->b_io.prev,
626 struct inode, i_list);
627 long pages_skipped; 614 long pages_skipped;
628 615 struct inode *inode = list_entry(wb->b_io.prev,
629 /* 616 struct inode, i_list);
630 * super block given and doesn't match, skip this inode 617 if (wbc->sb && sb != inode->i_sb) {
631 */ 618 /* super block given and doesn't
632 if (sb && sb != inode->i_sb) { 619 match, skip this inode */
633 redirty_tail(inode); 620 redirty_tail(inode);
634 continue; 621 continue;
635 } 622 }
636 623 if (sb != inode->i_sb)
624 /* finish with this superblock */
625 return 0;
637 if (inode->i_state & (I_NEW | I_WILL_FREE)) { 626 if (inode->i_state & (I_NEW | I_WILL_FREE)) {
638 requeue_io(inode); 627 requeue_io(inode);
639 continue; 628 continue;
640 } 629 }
641
642 /* 630 /*
643 * Was this inode dirtied after sync_sb_inodes was called? 631 * Was this inode dirtied after sync_sb_inodes was called?
644 * This keeps sync from extra jobs and livelock. 632 * This keeps sync from extra jobs and livelock.
645 */ 633 */
646 if (inode_dirtied_after(inode, start)) 634 if (inode_dirtied_after(inode, wbc->wb_start))
647 break; 635 return 1;
648
649 if (pin_sb_for_writeback(wbc, inode, &pin_sb)) {
650 requeue_io(inode);
651 continue;
652 }
653 636
654 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); 637 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
655 __iget(inode); 638 __iget(inode);
@@ -668,14 +651,50 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
668 spin_lock(&inode_lock); 651 spin_lock(&inode_lock);
669 if (wbc->nr_to_write <= 0) { 652 if (wbc->nr_to_write <= 0) {
670 wbc->more_io = 1; 653 wbc->more_io = 1;
671 break; 654 return 1;
672 } 655 }
673 if (!list_empty(&wb->b_more_io)) 656 if (!list_empty(&wb->b_more_io))
674 wbc->more_io = 1; 657 wbc->more_io = 1;
675 } 658 }
659 /* b_io is empty */
660 return 1;
661}
662
663static void writeback_inodes_wb(struct bdi_writeback *wb,
664 struct writeback_control *wbc)
665{
666 int ret = 0;
676 667
677 unpin_sb_for_writeback(&pin_sb); 668 wbc->wb_start = jiffies; /* livelock avoidance */
669 spin_lock(&inode_lock);
670 if (!wbc->for_kupdate || list_empty(&wb->b_io))
671 queue_io(wb, wbc->older_than_this);
678 672
673 while (!list_empty(&wb->b_io)) {
674 struct inode *inode = list_entry(wb->b_io.prev,
675 struct inode, i_list);
676 struct super_block *sb = inode->i_sb;
677 enum sb_pin_state state;
678
679 if (wbc->sb && sb != wbc->sb) {
680 /* super block given and doesn't
681 match, skip this inode */
682 redirty_tail(inode);
683 continue;
684 }
685 state = pin_sb_for_writeback(wbc, sb);
686
687 if (state == SB_PIN_FAILED) {
688 requeue_io(inode);
689 continue;
690 }
691 ret = writeback_sb_inodes(sb, wb, wbc);
692
693 if (state == SB_PINNED)
694 unpin_sb_for_writeback(sb);
695 if (ret)
696 break;
697 }
679 spin_unlock(&inode_lock); 698 spin_unlock(&inode_lock);
680 /* Leave any unwritten inodes on b_io */ 699 /* Leave any unwritten inodes on b_io */
681} 700}
@@ -1187,6 +1206,23 @@ void writeback_inodes_sb(struct super_block *sb)
1187EXPORT_SYMBOL(writeback_inodes_sb); 1206EXPORT_SYMBOL(writeback_inodes_sb);
1188 1207
1189/** 1208/**
1209 * writeback_inodes_sb_if_idle - start writeback if none underway
1210 * @sb: the superblock
1211 *
1212 * Invoke writeback_inodes_sb if no writeback is currently underway.
1213 * Returns 1 if writeback was started, 0 if not.
1214 */
1215int writeback_inodes_sb_if_idle(struct super_block *sb)
1216{
1217 if (!writeback_in_progress(sb->s_bdi)) {
1218 writeback_inodes_sb(sb);
1219 return 1;
1220 } else
1221 return 0;
1222}
1223EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
1224
1225/**
1190 * sync_inodes_sb - sync sb inode pages 1226 * sync_inodes_sb - sync sb inode pages
1191 * @sb: the superblock 1227 * @sb: the superblock
1192 * 1228 *
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index 864dac20a242..cc94bb9563f2 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -1,7 +1,6 @@
1 1
2config FSCACHE 2config FSCACHE
3 tristate "General filesystem local caching manager" 3 tristate "General filesystem local caching manager"
4 depends on EXPERIMENTAL
5 select SLOW_WORK 4 select SLOW_WORK
6 help 5 help
7 This option enables a generic filesystem caching manager that can be 6 This option enables a generic filesystem caching manager that can be
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 3221a0c7944e..1e1f286dd70e 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -12,6 +12,7 @@
12#define FSCACHE_DEBUG_LEVEL COOKIE 12#define FSCACHE_DEBUG_LEVEL COOKIE
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/seq_file.h> 14#include <linux/seq_file.h>
15#include <linux/slab.h>
15#include <linux/key.h> 16#include <linux/key.h>
16#include <keys/user-type.h> 17#include <keys/user-type.h>
17#include "internal.h" 18#include "internal.h"
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index e513ac599c8e..0b589a9b4ffc 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -53,7 +53,7 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
53static void fscache_object_slow_work_put_ref(struct slow_work *); 53static void fscache_object_slow_work_put_ref(struct slow_work *);
54static int fscache_object_slow_work_get_ref(struct slow_work *); 54static int fscache_object_slow_work_get_ref(struct slow_work *);
55static void fscache_object_slow_work_execute(struct slow_work *); 55static void fscache_object_slow_work_execute(struct slow_work *);
56#ifdef CONFIG_SLOW_WORK_PROC 56#ifdef CONFIG_SLOW_WORK_DEBUG
57static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *); 57static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
58#endif 58#endif
59static void fscache_initialise_object(struct fscache_object *); 59static void fscache_initialise_object(struct fscache_object *);
@@ -69,7 +69,7 @@ const struct slow_work_ops fscache_object_slow_work_ops = {
69 .get_ref = fscache_object_slow_work_get_ref, 69 .get_ref = fscache_object_slow_work_get_ref,
70 .put_ref = fscache_object_slow_work_put_ref, 70 .put_ref = fscache_object_slow_work_put_ref,
71 .execute = fscache_object_slow_work_execute, 71 .execute = fscache_object_slow_work_execute,
72#ifdef CONFIG_SLOW_WORK_PROC 72#ifdef CONFIG_SLOW_WORK_DEBUG
73 .desc = fscache_object_slow_work_desc, 73 .desc = fscache_object_slow_work_desc,
74#endif 74#endif
75}; 75};
@@ -364,7 +364,7 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
364/* 364/*
365 * describe an object for slow-work debugging 365 * describe an object for slow-work debugging
366 */ 366 */
367#ifdef CONFIG_SLOW_WORK_PROC 367#ifdef CONFIG_SLOW_WORK_DEBUG
368static void fscache_object_slow_work_desc(struct slow_work *work, 368static void fscache_object_slow_work_desc(struct slow_work *work,
369 struct seq_file *m) 369 struct seq_file *m)
370{ 370{
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 313e79a14266..f17cecafae44 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -14,6 +14,7 @@
14#define FSCACHE_DEBUG_LEVEL OPERATION 14#define FSCACHE_DEBUG_LEVEL OPERATION
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/seq_file.h> 16#include <linux/seq_file.h>
17#include <linux/slab.h>
17#include "internal.h" 18#include "internal.h"
18 19
19atomic_t fscache_op_debug_id; 20atomic_t fscache_op_debug_id;
@@ -500,7 +501,7 @@ static void fscache_op_execute(struct slow_work *work)
500/* 501/*
501 * describe an operation for slow-work debugging 502 * describe an operation for slow-work debugging
502 */ 503 */
503#ifdef CONFIG_SLOW_WORK_PROC 504#ifdef CONFIG_SLOW_WORK_DEBUG
504static void fscache_op_desc(struct slow_work *work, struct seq_file *m) 505static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
505{ 506{
506 struct fscache_operation *op = 507 struct fscache_operation *op =
@@ -517,7 +518,7 @@ const struct slow_work_ops fscache_op_slow_work_ops = {
517 .get_ref = fscache_op_get_ref, 518 .get_ref = fscache_op_get_ref,
518 .put_ref = fscache_op_put_ref, 519 .put_ref = fscache_op_put_ref,
519 .execute = fscache_op_execute, 520 .execute = fscache_op_execute,
520#ifdef CONFIG_SLOW_WORK_PROC 521#ifdef CONFIG_SLOW_WORK_DEBUG
521 .desc = fscache_op_desc, 522 .desc = fscache_op_desc,
522#endif 523#endif
523}; 524};
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index c598ea4c4e7d..47aefd376e54 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -14,6 +14,7 @@
14#include <linux/fscache-cache.h> 14#include <linux/fscache-cache.h>
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include <linux/pagevec.h> 16#include <linux/pagevec.h>
17#include <linux/slab.h>
17#include "internal.h" 18#include "internal.h"
18 19
19/* 20/*
@@ -881,6 +882,7 @@ submit_failed:
881 goto nobufs; 882 goto nobufs;
882 883
883nobufs_unlock_obj: 884nobufs_unlock_obj:
885 spin_unlock(&cookie->stores_lock);
884 spin_unlock(&object->lock); 886 spin_unlock(&object->lock);
885nobufs: 887nobufs:
886 spin_unlock(&cookie->lock); 888 spin_unlock(&cookie->lock);
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 46435f3aae68..4765190d537f 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -165,8 +165,8 @@ static int fscache_stats_show(struct seq_file *m, void *v)
165 atomic_read(&fscache_n_object_lookups), 165 atomic_read(&fscache_n_object_lookups),
166 atomic_read(&fscache_n_object_lookups_negative), 166 atomic_read(&fscache_n_object_lookups_negative),
167 atomic_read(&fscache_n_object_lookups_positive), 167 atomic_read(&fscache_n_object_lookups_positive),
168 atomic_read(&fscache_n_object_lookups_timed_out), 168 atomic_read(&fscache_n_object_created),
169 atomic_read(&fscache_n_object_created)); 169 atomic_read(&fscache_n_object_lookups_timed_out));
170 170
171 seq_printf(m, "Updates: n=%u nul=%u run=%u\n", 171 seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
172 atomic_read(&fscache_n_updates), 172 atomic_read(&fscache_n_updates),
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index de792dcf3274..e1f8171278bd 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -44,6 +44,7 @@
44#include <linux/magic.h> 44#include <linux/magic.h>
45#include <linux/miscdevice.h> 45#include <linux/miscdevice.h>
46#include <linux/mutex.h> 46#include <linux/mutex.h>
47#include <linux/slab.h>
47#include <linux/spinlock.h> 48#include <linux/spinlock.h>
48#include <linux/stat.h> 49#include <linux/stat.h>
49 50
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 51d9e33d634f..eb7e9423691f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -865,13 +865,10 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
865 865
866 down_read(&fc->killsb); 866 down_read(&fc->killsb);
867 err = -ENOENT; 867 err = -ENOENT;
868 if (!fc->sb) 868 if (fc->sb) {
869 goto err_unlock; 869 err = fuse_reverse_inval_inode(fc->sb, outarg.ino,
870 870 outarg.off, outarg.len);
871 err = fuse_reverse_inval_inode(fc->sb, outarg.ino, 871 }
872 outarg.off, outarg.len);
873
874err_unlock:
875 up_read(&fc->killsb); 872 up_read(&fc->killsb);
876 return err; 873 return err;
877 874
@@ -884,10 +881,15 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
884 struct fuse_copy_state *cs) 881 struct fuse_copy_state *cs)
885{ 882{
886 struct fuse_notify_inval_entry_out outarg; 883 struct fuse_notify_inval_entry_out outarg;
887 int err = -EINVAL; 884 int err = -ENOMEM;
888 char buf[FUSE_NAME_MAX+1]; 885 char *buf;
889 struct qstr name; 886 struct qstr name;
890 887
888 buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
889 if (!buf)
890 goto err;
891
892 err = -EINVAL;
891 if (size < sizeof(outarg)) 893 if (size < sizeof(outarg))
892 goto err; 894 goto err;
893 895
@@ -910,16 +912,14 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
910 912
911 down_read(&fc->killsb); 913 down_read(&fc->killsb);
912 err = -ENOENT; 914 err = -ENOENT;
913 if (!fc->sb) 915 if (fc->sb)
914 goto err_unlock; 916 err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
915
916 err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
917
918err_unlock:
919 up_read(&fc->killsb); 917 up_read(&fc->killsb);
918 kfree(buf);
920 return err; 919 return err;
921 920
922err: 921err:
922 kfree(buf);
923 fuse_copy_finish(cs); 923 fuse_copy_finish(cs);
924 return err; 924 return err;
925} 925}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c18913a777ae..a9f5e137f1d3 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -828,6 +828,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
828 if (!page) 828 if (!page)
829 break; 829 break;
830 830
831 if (mapping_writably_mapped(mapping))
832 flush_dcache_page(page);
833
831 pagefault_disable(); 834 pagefault_disable();
832 tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); 835 tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
833 pagefault_enable(); 836 pagefault_enable();
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 1a822ce2b24b..ec14d19ce501 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -850,7 +850,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
850 req->in.args[0].size = sizeof(*arg); 850 req->in.args[0].size = sizeof(*arg);
851 req->in.args[0].value = arg; 851 req->in.args[0].value = arg;
852 req->out.numargs = 1; 852 req->out.numargs = 1;
853 /* Variable length arguement used for backward compatibility 853 /* Variable length argument used for backward compatibility
854 with interface version < 7.5. Rest of init_out is zeroed 854 with interface version < 7.5. Rest of init_out is zeroed
855 by do_get_request(), so a short reply is not a problem */ 855 by do_get_request(), so a short reply is not a problem */
856 req->out.argvar = 1; 856 req->out.argvar = 1;
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 55458031e501..fe5df5457656 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/gfp.h>
10#include <linux/fs.h> 11#include <linux/fs.h>
11#include <linux/generic_acl.h> 12#include <linux/generic_acl.h>
12#include <linux/posix_acl.h> 13#include <linux/posix_acl.h>
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 4dcddf83326f..a47b43107112 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -8,7 +8,6 @@ config GFS2_FS
8 select FS_POSIX_ACL 8 select FS_POSIX_ACL
9 select CRC32 9 select CRC32
10 select SLOW_WORK 10 select SLOW_WORK
11 select QUOTA
12 select QUOTACTL 11 select QUOTACTL
13 help 12 help
14 A cluster filesystem. 13 A cluster filesystem.
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 7b8da9415267..0c1d0b82dcf1 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1061,8 +1061,8 @@ out:
1061 1061
1062int gfs2_releasepage(struct page *page, gfp_t gfp_mask) 1062int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
1063{ 1063{
1064 struct inode *aspace = page->mapping->host; 1064 struct address_space *mapping = page->mapping;
1065 struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info; 1065 struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
1066 struct buffer_head *bh, *head; 1066 struct buffer_head *bh, *head;
1067 struct gfs2_bufdata *bd; 1067 struct gfs2_bufdata *bd;
1068 1068
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6d47379e794b..5e411d5f4697 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
@@ -541,7 +540,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
541 *ptr++ = cpu_to_be64(bn++); 540 *ptr++ = cpu_to_be64(bn++);
542 break; 541 break;
543 } 542 }
544 } while (state != ALLOC_DATA); 543 } while ((state != ALLOC_DATA) || !dblock);
545 544
546 ip->i_height = height; 545 ip->i_height = height;
547 gfs2_add_inode_blocks(&ip->i_inode, alloced); 546 gfs2_add_inode_blocks(&ip->i_inode, alloced);
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 91beddadd388..bb7907bde3d8 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index d15876e9aa26..c22c21174833 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4eb308aa3234..e6dd2aec6f82 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -569,6 +569,40 @@ static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
569 return ret; 569 return ret;
570} 570}
571 571
572/**
573 * gfs2_file_aio_write - Perform a write to a file
574 * @iocb: The io context
575 * @iov: The data to write
576 * @nr_segs: Number of @iov segments
577 * @pos: The file position
578 *
579 * We have to do a lock/unlock here to refresh the inode size for
580 * O_APPEND writes, otherwise we can land up writing at the wrong
581 * offset. There is still a race, but provided the app is using its
582 * own file locking, this will make O_APPEND work as expected.
583 *
584 */
585
586static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
587 unsigned long nr_segs, loff_t pos)
588{
589 struct file *file = iocb->ki_filp;
590
591 if (file->f_flags & O_APPEND) {
592 struct dentry *dentry = file->f_dentry;
593 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
594 struct gfs2_holder gh;
595 int ret;
596
597 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
598 if (ret)
599 return ret;
600 gfs2_glock_dq_uninit(&gh);
601 }
602
603 return generic_file_aio_write(iocb, iov, nr_segs, pos);
604}
605
572#ifdef CONFIG_GFS2_FS_LOCKING_DLM 606#ifdef CONFIG_GFS2_FS_LOCKING_DLM
573 607
574/** 608/**
@@ -606,7 +640,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
606 640
607 if (!(fl->fl_flags & FL_POSIX)) 641 if (!(fl->fl_flags & FL_POSIX))
608 return -ENOLCK; 642 return -ENOLCK;
609 if (__mandatory_lock(&ip->i_inode)) 643 if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK)
610 return -ENOLCK; 644 return -ENOLCK;
611 645
612 if (cmd == F_CANCELLK) { 646 if (cmd == F_CANCELLK) {
@@ -711,7 +745,7 @@ const struct file_operations gfs2_file_fops = {
711 .read = do_sync_read, 745 .read = do_sync_read,
712 .aio_read = generic_file_aio_read, 746 .aio_read = generic_file_aio_read,
713 .write = do_sync_write, 747 .write = do_sync_write,
714 .aio_write = generic_file_aio_write, 748 .aio_write = gfs2_file_aio_write,
715 .unlocked_ioctl = gfs2_ioctl, 749 .unlocked_ioctl = gfs2_ioctl,
716 .mmap = gfs2_mmap, 750 .mmap = gfs2_mmap,
717 .open = gfs2_open, 751 .open = gfs2_open,
@@ -741,7 +775,7 @@ const struct file_operations gfs2_file_fops_nolock = {
741 .read = do_sync_read, 775 .read = do_sync_read,
742 .aio_read = generic_file_aio_read, 776 .aio_read = generic_file_aio_read,
743 .write = do_sync_write, 777 .write = do_sync_write,
744 .aio_write = generic_file_aio_write, 778 .aio_write = gfs2_file_aio_write,
745 .unlocked_ioctl = gfs2_ioctl, 779 .unlocked_ioctl = gfs2_ioctl,
746 .mmap = gfs2_mmap, 780 .mmap = gfs2_mmap,
747 .open = gfs2_open, 781 .open = gfs2_open,
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index f455a03a09e2..454d4b4eb36b 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -19,7 +19,6 @@
19#include <linux/list.h> 19#include <linux/list.h>
20#include <linux/wait.h> 20#include <linux/wait.h>
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/rwsem.h>
23#include <asm/uaccess.h> 22#include <asm/uaccess.h>
24#include <linux/seq_file.h> 23#include <linux/seq_file.h>
25#include <linux/debugfs.h> 24#include <linux/debugfs.h>
@@ -60,7 +59,6 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
60#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0) 59#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
61static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); 60static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
62 61
63static DECLARE_RWSEM(gfs2_umount_flush_sem);
64static struct dentry *gfs2_root; 62static struct dentry *gfs2_root;
65static struct workqueue_struct *glock_workqueue; 63static struct workqueue_struct *glock_workqueue;
66struct workqueue_struct *gfs2_delete_workqueue; 64struct workqueue_struct *gfs2_delete_workqueue;
@@ -154,12 +152,14 @@ static unsigned int gl_hash(const struct gfs2_sbd *sdp,
154static void glock_free(struct gfs2_glock *gl) 152static void glock_free(struct gfs2_glock *gl)
155{ 153{
156 struct gfs2_sbd *sdp = gl->gl_sbd; 154 struct gfs2_sbd *sdp = gl->gl_sbd;
157 struct inode *aspace = gl->gl_aspace; 155 struct address_space *mapping = gfs2_glock2aspace(gl);
156 struct kmem_cache *cachep = gfs2_glock_cachep;
158 157
159 if (aspace) 158 GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
160 gfs2_aspace_put(aspace);
161 trace_gfs2_glock_put(gl); 159 trace_gfs2_glock_put(gl);
162 sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl); 160 if (mapping)
161 cachep = gfs2_glock_aspace_cachep;
162 sdp->sd_lockstruct.ls_ops->lm_put_lock(cachep, gl);
163} 163}
164 164
165/** 165/**
@@ -712,7 +712,6 @@ static void glock_work_func(struct work_struct *work)
712 finish_xmote(gl, gl->gl_reply); 712 finish_xmote(gl, gl->gl_reply);
713 drop_ref = 1; 713 drop_ref = 1;
714 } 714 }
715 down_read(&gfs2_umount_flush_sem);
716 spin_lock(&gl->gl_spin); 715 spin_lock(&gl->gl_spin);
717 if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && 716 if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
718 gl->gl_state != LM_ST_UNLOCKED && 717 gl->gl_state != LM_ST_UNLOCKED &&
@@ -725,7 +724,6 @@ static void glock_work_func(struct work_struct *work)
725 } 724 }
726 run_queue(gl, 0); 725 run_queue(gl, 0);
727 spin_unlock(&gl->gl_spin); 726 spin_unlock(&gl->gl_spin);
728 up_read(&gfs2_umount_flush_sem);
729 if (!delay || 727 if (!delay ||
730 queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) 728 queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
731 gfs2_glock_put(gl); 729 gfs2_glock_put(gl);
@@ -750,10 +748,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
750 const struct gfs2_glock_operations *glops, int create, 748 const struct gfs2_glock_operations *glops, int create,
751 struct gfs2_glock **glp) 749 struct gfs2_glock **glp)
752{ 750{
751 struct super_block *s = sdp->sd_vfs;
753 struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type }; 752 struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
754 struct gfs2_glock *gl, *tmp; 753 struct gfs2_glock *gl, *tmp;
755 unsigned int hash = gl_hash(sdp, &name); 754 unsigned int hash = gl_hash(sdp, &name);
756 int error; 755 struct address_space *mapping;
757 756
758 read_lock(gl_lock_addr(hash)); 757 read_lock(gl_lock_addr(hash));
759 gl = search_bucket(hash, sdp, &name); 758 gl = search_bucket(hash, sdp, &name);
@@ -765,10 +764,14 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
765 if (!create) 764 if (!create)
766 return -ENOENT; 765 return -ENOENT;
767 766
768 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL); 767 if (glops->go_flags & GLOF_ASPACE)
768 gl = kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_KERNEL);
769 else
770 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
769 if (!gl) 771 if (!gl)
770 return -ENOMEM; 772 return -ENOMEM;
771 773
774 atomic_inc(&sdp->sd_glock_disposal);
772 gl->gl_flags = 0; 775 gl->gl_flags = 0;
773 gl->gl_name = name; 776 gl->gl_name = name;
774 atomic_set(&gl->gl_ref, 1); 777 atomic_set(&gl->gl_ref, 1);
@@ -783,18 +786,18 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
783 gl->gl_tchange = jiffies; 786 gl->gl_tchange = jiffies;
784 gl->gl_object = NULL; 787 gl->gl_object = NULL;
785 gl->gl_sbd = sdp; 788 gl->gl_sbd = sdp;
786 gl->gl_aspace = NULL;
787 INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); 789 INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
788 INIT_WORK(&gl->gl_delete, delete_work_func); 790 INIT_WORK(&gl->gl_delete, delete_work_func);
789 791
790 /* If this glock protects actual on-disk data or metadata blocks, 792 mapping = gfs2_glock2aspace(gl);
791 create a VFS inode to manage the pages/buffers holding them. */ 793 if (mapping) {
792 if (glops == &gfs2_inode_glops || glops == &gfs2_rgrp_glops) { 794 mapping->a_ops = &gfs2_meta_aops;
793 gl->gl_aspace = gfs2_aspace_get(sdp); 795 mapping->host = s->s_bdev->bd_inode;
794 if (!gl->gl_aspace) { 796 mapping->flags = 0;
795 error = -ENOMEM; 797 mapping_set_gfp_mask(mapping, GFP_NOFS);
796 goto fail; 798 mapping->assoc_mapping = NULL;
797 } 799 mapping->backing_dev_info = s->s_bdi;
800 mapping->writeback_index = 0;
798 } 801 }
799 802
800 write_lock(gl_lock_addr(hash)); 803 write_lock(gl_lock_addr(hash));
@@ -811,10 +814,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
811 *glp = gl; 814 *glp = gl;
812 815
813 return 0; 816 return 0;
814
815fail:
816 kmem_cache_free(gfs2_glock_cachep, gl);
817 return error;
818} 817}
819 818
820/** 819/**
@@ -1509,35 +1508,13 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
1509 1508
1510void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) 1509void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
1511{ 1510{
1512 unsigned long t;
1513 unsigned int x; 1511 unsigned int x;
1514 int cont;
1515 1512
1516 t = jiffies; 1513 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1517 1514 examine_bucket(clear_glock, sdp, x);
1518 for (;;) { 1515 flush_workqueue(glock_workqueue);
1519 cont = 0; 1516 wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
1520 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) { 1517 gfs2_dump_lockstate(sdp);
1521 if (examine_bucket(clear_glock, sdp, x))
1522 cont = 1;
1523 }
1524
1525 if (!cont)
1526 break;
1527
1528 if (time_after_eq(jiffies,
1529 t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
1530 fs_warn(sdp, "Unmount seems to be stalled. "
1531 "Dumping lock state...\n");
1532 gfs2_dump_lockstate(sdp);
1533 t = jiffies;
1534 }
1535
1536 down_write(&gfs2_umount_flush_sem);
1537 invalidate_inodes(sdp->sd_vfs);
1538 up_write(&gfs2_umount_flush_sem);
1539 msleep(10);
1540 }
1541} 1518}
1542 1519
1543void gfs2_glock_finish_truncate(struct gfs2_inode *ip) 1520void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
@@ -1681,7 +1658,7 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
1681 dtime *= 1000000/HZ; /* demote time in uSec */ 1658 dtime *= 1000000/HZ; /* demote time in uSec */
1682 if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) 1659 if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
1683 dtime = 0; 1660 dtime = 0;
1684 gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu a:%d r:%d\n", 1661 gfs2_print_dbg(seq, "G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d r:%d\n",
1685 state2str(gl->gl_state), 1662 state2str(gl->gl_state),
1686 gl->gl_name.ln_type, 1663 gl->gl_name.ln_type,
1687 (unsigned long long)gl->gl_name.ln_number, 1664 (unsigned long long)gl->gl_name.ln_number,
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 13f0bd228132..2bda1911b156 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -123,7 +123,7 @@ struct lm_lockops {
123 int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); 123 int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname);
124 void (*lm_unmount) (struct gfs2_sbd *sdp); 124 void (*lm_unmount) (struct gfs2_sbd *sdp);
125 void (*lm_withdraw) (struct gfs2_sbd *sdp); 125 void (*lm_withdraw) (struct gfs2_sbd *sdp);
126 void (*lm_put_lock) (struct kmem_cache *cachep, void *gl); 126 void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl);
127 unsigned int (*lm_lock) (struct gfs2_glock *gl, 127 unsigned int (*lm_lock) (struct gfs2_glock *gl,
128 unsigned int req_state, unsigned int flags); 128 unsigned int req_state, unsigned int flags);
129 void (*lm_cancel) (struct gfs2_glock *gl); 129 void (*lm_cancel) (struct gfs2_glock *gl);
@@ -180,6 +180,13 @@ static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
180 return gl->gl_state == LM_ST_SHARED; 180 return gl->gl_state == LM_ST_SHARED;
181} 181}
182 182
183static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
184{
185 if (gl->gl_ops->go_flags & GLOF_ASPACE)
186 return (struct address_space *)(gl + 1);
187 return NULL;
188}
189
183int gfs2_glock_get(struct gfs2_sbd *sdp, 190int gfs2_glock_get(struct gfs2_sbd *sdp,
184 u64 number, const struct gfs2_glock_operations *glops, 191 u64 number, const struct gfs2_glock_operations *glops,
185 int create, struct gfs2_glock **glp); 192 int create, struct gfs2_glock **glp);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 78554acc0605..49f97d3bb690 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
@@ -87,7 +86,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
87 86
88static void rgrp_go_sync(struct gfs2_glock *gl) 87static void rgrp_go_sync(struct gfs2_glock *gl)
89{ 88{
90 struct address_space *metamapping = gl->gl_aspace->i_mapping; 89 struct address_space *metamapping = gfs2_glock2aspace(gl);
91 int error; 90 int error;
92 91
93 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) 92 if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
@@ -113,7 +112,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
113 112
114static void rgrp_go_inval(struct gfs2_glock *gl, int flags) 113static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
115{ 114{
116 struct address_space *mapping = gl->gl_aspace->i_mapping; 115 struct address_space *mapping = gfs2_glock2aspace(gl);
117 116
118 BUG_ON(!(flags & DIO_METADATA)); 117 BUG_ON(!(flags & DIO_METADATA));
119 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); 118 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
@@ -134,7 +133,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
134static void inode_go_sync(struct gfs2_glock *gl) 133static void inode_go_sync(struct gfs2_glock *gl)
135{ 134{
136 struct gfs2_inode *ip = gl->gl_object; 135 struct gfs2_inode *ip = gl->gl_object;
137 struct address_space *metamapping = gl->gl_aspace->i_mapping; 136 struct address_space *metamapping = gfs2_glock2aspace(gl);
138 int error; 137 int error;
139 138
140 if (ip && !S_ISREG(ip->i_inode.i_mode)) 139 if (ip && !S_ISREG(ip->i_inode.i_mode))
@@ -183,7 +182,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
183 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); 182 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
184 183
185 if (flags & DIO_METADATA) { 184 if (flags & DIO_METADATA) {
186 struct address_space *mapping = gl->gl_aspace->i_mapping; 185 struct address_space *mapping = gfs2_glock2aspace(gl);
187 truncate_inode_pages(mapping, 0); 186 truncate_inode_pages(mapping, 0);
188 if (ip) { 187 if (ip) {
189 set_bit(GIF_INVALID, &ip->i_flags); 188 set_bit(GIF_INVALID, &ip->i_flags);
@@ -282,7 +281,8 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
282 281
283static int rgrp_go_demote_ok(const struct gfs2_glock *gl) 282static int rgrp_go_demote_ok(const struct gfs2_glock *gl)
284{ 283{
285 return !gl->gl_aspace->i_mapping->nrpages; 284 const struct address_space *mapping = (const struct address_space *)(gl + 1);
285 return !mapping->nrpages;
286} 286}
287 287
288/** 288/**
@@ -387,8 +387,7 @@ static void iopen_go_callback(struct gfs2_glock *gl)
387 struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object; 387 struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
388 388
389 if (gl->gl_demote_state == LM_ST_UNLOCKED && 389 if (gl->gl_demote_state == LM_ST_UNLOCKED &&
390 gl->gl_state == LM_ST_SHARED && 390 gl->gl_state == LM_ST_SHARED && ip) {
391 ip && test_bit(GIF_USER, &ip->i_flags)) {
392 gfs2_glock_hold(gl); 391 gfs2_glock_hold(gl);
393 if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) 392 if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
394 gfs2_glock_put_nolock(gl); 393 gfs2_glock_put_nolock(gl);
@@ -407,6 +406,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
407 .go_dump = inode_go_dump, 406 .go_dump = inode_go_dump,
408 .go_type = LM_TYPE_INODE, 407 .go_type = LM_TYPE_INODE,
409 .go_min_hold_time = HZ / 5, 408 .go_min_hold_time = HZ / 5,
409 .go_flags = GLOF_ASPACE,
410}; 410};
411 411
412const struct gfs2_glock_operations gfs2_rgrp_glops = { 412const struct gfs2_glock_operations gfs2_rgrp_glops = {
@@ -418,6 +418,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
418 .go_dump = gfs2_rgrp_dump, 418 .go_dump = gfs2_rgrp_dump,
419 .go_type = LM_TYPE_RGRP, 419 .go_type = LM_TYPE_RGRP,
420 .go_min_hold_time = HZ / 5, 420 .go_min_hold_time = HZ / 5,
421 .go_flags = GLOF_ASPACE,
421}; 422};
422 423
423const struct gfs2_glock_operations gfs2_trans_glops = { 424const struct gfs2_glock_operations gfs2_trans_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 4792200978c8..3aac46f6853e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -162,6 +162,8 @@ struct gfs2_glock_operations {
162 void (*go_callback) (struct gfs2_glock *gl); 162 void (*go_callback) (struct gfs2_glock *gl);
163 const int go_type; 163 const int go_type;
164 const unsigned long go_min_hold_time; 164 const unsigned long go_min_hold_time;
165 const unsigned long go_flags;
166#define GLOF_ASPACE 1
165}; 167};
166 168
167enum { 169enum {
@@ -225,7 +227,6 @@ struct gfs2_glock {
225 227
226 struct gfs2_sbd *gl_sbd; 228 struct gfs2_sbd *gl_sbd;
227 229
228 struct inode *gl_aspace;
229 struct list_head gl_ail_list; 230 struct list_head gl_ail_list;
230 atomic_t gl_ail_count; 231 atomic_t gl_ail_count;
231 struct delayed_work gl_work; 232 struct delayed_work gl_work;
@@ -258,7 +259,6 @@ enum {
258 GIF_INVALID = 0, 259 GIF_INVALID = 0,
259 GIF_QD_LOCKED = 1, 260 GIF_QD_LOCKED = 1,
260 GIF_SW_PAGED = 3, 261 GIF_SW_PAGED = 3,
261 GIF_USER = 4, /* user inode, not metadata addr space */
262}; 262};
263 263
264 264
@@ -451,7 +451,6 @@ struct gfs2_tune {
451 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ 451 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
452 unsigned int gt_new_files_jdata; 452 unsigned int gt_new_files_jdata;
453 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ 453 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
454 unsigned int gt_stall_secs; /* Detects trouble! */
455 unsigned int gt_complain_secs; 454 unsigned int gt_complain_secs;
456 unsigned int gt_statfs_quantum; 455 unsigned int gt_statfs_quantum;
457 unsigned int gt_statfs_slow; 456 unsigned int gt_statfs_slow;
@@ -544,6 +543,8 @@ struct gfs2_sbd {
544 struct gfs2_holder sd_live_gh; 543 struct gfs2_holder sd_live_gh;
545 struct gfs2_glock *sd_rename_gl; 544 struct gfs2_glock *sd_rename_gl;
546 struct gfs2_glock *sd_trans_gl; 545 struct gfs2_glock *sd_trans_gl;
546 wait_queue_head_t sd_glock_wait;
547 atomic_t sd_glock_disposal;
547 548
548 /* Inode Stuff */ 549 /* Inode Stuff */
549 550
@@ -615,7 +616,7 @@ struct gfs2_sbd {
615 unsigned int sd_log_blks_reserved; 616 unsigned int sd_log_blks_reserved;
616 unsigned int sd_log_commited_buf; 617 unsigned int sd_log_commited_buf;
617 unsigned int sd_log_commited_databuf; 618 unsigned int sd_log_commited_databuf;
618 unsigned int sd_log_commited_revoke; 619 int sd_log_commited_revoke;
619 620
620 unsigned int sd_log_num_buf; 621 unsigned int sd_log_num_buf;
621 unsigned int sd_log_num_revoke; 622 unsigned int sd_log_num_revoke;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 6e220f4eee7d..b1bf2694fb2b 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -45,7 +45,7 @@ static int iget_test(struct inode *inode, void *opaque)
45 struct gfs2_inode *ip = GFS2_I(inode); 45 struct gfs2_inode *ip = GFS2_I(inode);
46 u64 *no_addr = opaque; 46 u64 *no_addr = opaque;
47 47
48 if (ip->i_no_addr == *no_addr && test_bit(GIF_USER, &ip->i_flags)) 48 if (ip->i_no_addr == *no_addr)
49 return 1; 49 return 1;
50 50
51 return 0; 51 return 0;
@@ -58,7 +58,6 @@ static int iget_set(struct inode *inode, void *opaque)
58 58
59 inode->i_ino = (unsigned long)*no_addr; 59 inode->i_ino = (unsigned long)*no_addr;
60 ip->i_no_addr = *no_addr; 60 ip->i_no_addr = *no_addr;
61 set_bit(GIF_USER, &ip->i_flags);
62 return 0; 61 return 0;
63} 62}
64 63
@@ -84,7 +83,7 @@ static int iget_skip_test(struct inode *inode, void *opaque)
84 struct gfs2_inode *ip = GFS2_I(inode); 83 struct gfs2_inode *ip = GFS2_I(inode);
85 struct gfs2_skip_data *data = opaque; 84 struct gfs2_skip_data *data = opaque;
86 85
87 if (ip->i_no_addr == data->no_addr && test_bit(GIF_USER, &ip->i_flags)){ 86 if (ip->i_no_addr == data->no_addr) {
88 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){ 87 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){
89 data->skipped = 1; 88 data->skipped = 1;
90 return 0; 89 return 0;
@@ -103,7 +102,6 @@ static int iget_skip_set(struct inode *inode, void *opaque)
103 return 1; 102 return 1;
104 inode->i_ino = (unsigned long)(data->no_addr); 103 inode->i_ino = (unsigned long)(data->no_addr);
105 ip->i_no_addr = data->no_addr; 104 ip->i_no_addr = data->no_addr;
106 set_bit(GIF_USER, &ip->i_flags);
107 return 0; 105 return 0;
108} 106}
109 107
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 46df988323bc..0e0470ed34c2 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/dlm.h> 11#include <linux/dlm.h>
12#include <linux/slab.h>
12#include <linux/types.h> 13#include <linux/types.h>
13#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
14 15
@@ -21,6 +22,7 @@ static void gdlm_ast(void *arg)
21{ 22{
22 struct gfs2_glock *gl = arg; 23 struct gfs2_glock *gl = arg;
23 unsigned ret = gl->gl_state; 24 unsigned ret = gl->gl_state;
25 struct gfs2_sbd *sdp = gl->gl_sbd;
24 26
25 BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); 27 BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);
26 28
@@ -29,7 +31,12 @@ static void gdlm_ast(void *arg)
29 31
30 switch (gl->gl_lksb.sb_status) { 32 switch (gl->gl_lksb.sb_status) {
31 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ 33 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
32 kmem_cache_free(gfs2_glock_cachep, gl); 34 if (gl->gl_ops->go_flags & GLOF_ASPACE)
35 kmem_cache_free(gfs2_glock_aspace_cachep, gl);
36 else
37 kmem_cache_free(gfs2_glock_cachep, gl);
38 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
39 wake_up(&sdp->sd_glock_wait);
33 return; 40 return;
34 case -DLM_ECANCEL: /* Cancel while getting lock */ 41 case -DLM_ECANCEL: /* Cancel while getting lock */
35 ret |= LM_OUT_CANCELED; 42 ret |= LM_OUT_CANCELED;
@@ -164,14 +171,16 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl,
164 return LM_OUT_ASYNC; 171 return LM_OUT_ASYNC;
165} 172}
166 173
167static void gdlm_put_lock(struct kmem_cache *cachep, void *ptr) 174static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
168{ 175{
169 struct gfs2_glock *gl = ptr; 176 struct gfs2_sbd *sdp = gl->gl_sbd;
170 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; 177 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
171 int error; 178 int error;
172 179
173 if (gl->gl_lksb.sb_lkid == 0) { 180 if (gl->gl_lksb.sb_lkid == 0) {
174 kmem_cache_free(cachep, gl); 181 kmem_cache_free(cachep, gl);
182 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
183 wake_up(&sdp->sd_glock_wait);
175 return; 184 return;
176 } 185 }
177 186
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 4511b08fc451..e5bf4b59d46e 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -417,7 +417,7 @@ static unsigned int calc_reserved(struct gfs2_sbd *sdp)
417 databufhdrs_needed = (sdp->sd_log_commited_databuf + 417 databufhdrs_needed = (sdp->sd_log_commited_databuf +
418 (dbuf_limit - 1)) / dbuf_limit; 418 (dbuf_limit - 1)) / dbuf_limit;
419 419
420 if (sdp->sd_log_commited_revoke) 420 if (sdp->sd_log_commited_revoke > 0)
421 revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, 421 revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
422 sizeof(u64)); 422 sizeof(u64));
423 423
@@ -790,7 +790,6 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
790 gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) || 790 gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) ||
791 (((int)sdp->sd_log_commited_databuf) >= 0)); 791 (((int)sdp->sd_log_commited_databuf) >= 0));
792 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; 792 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
793 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
794 reserved = calc_reserved(sdp); 793 reserved = calc_reserved(sdp);
795 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved); 794 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved);
796 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; 795 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index de97632ba32f..adc260fbea90 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -528,9 +528,9 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
528 gfs2_pin(sdp, bd->bd_bh); 528 gfs2_pin(sdp, bd->bd_bh);
529 tr->tr_num_databuf_new++; 529 tr->tr_num_databuf_new++;
530 sdp->sd_log_num_databuf++; 530 sdp->sd_log_num_databuf++;
531 list_add(&le->le_list, &sdp->sd_log_le_databuf); 531 list_add_tail(&le->le_list, &sdp->sd_log_le_databuf);
532 } else { 532 } else {
533 list_add(&le->le_list, &sdp->sd_log_le_ordered); 533 list_add_tail(&le->le_list, &sdp->sd_log_le_ordered);
534 } 534 }
535out: 535out:
536 gfs2_log_unlock(sdp); 536 gfs2_log_unlock(sdp);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 5b31f7741a8f..a88fadc704bb 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -52,6 +52,22 @@ static void gfs2_init_glock_once(void *foo)
52 atomic_set(&gl->gl_ail_count, 0); 52 atomic_set(&gl->gl_ail_count, 0);
53} 53}
54 54
55static void gfs2_init_gl_aspace_once(void *foo)
56{
57 struct gfs2_glock *gl = foo;
58 struct address_space *mapping = (struct address_space *)(gl + 1);
59
60 gfs2_init_glock_once(gl);
61 memset(mapping, 0, sizeof(*mapping));
62 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
63 spin_lock_init(&mapping->tree_lock);
64 spin_lock_init(&mapping->i_mmap_lock);
65 INIT_LIST_HEAD(&mapping->private_list);
66 spin_lock_init(&mapping->private_lock);
67 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
68 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
69}
70
55/** 71/**
56 * init_gfs2_fs - Register GFS2 as a filesystem 72 * init_gfs2_fs - Register GFS2 as a filesystem
57 * 73 *
@@ -78,6 +94,14 @@ static int __init init_gfs2_fs(void)
78 if (!gfs2_glock_cachep) 94 if (!gfs2_glock_cachep)
79 goto fail; 95 goto fail;
80 96
97 gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock (aspace)",
98 sizeof(struct gfs2_glock) +
99 sizeof(struct address_space),
100 0, 0, gfs2_init_gl_aspace_once);
101
102 if (!gfs2_glock_aspace_cachep)
103 goto fail;
104
81 gfs2_inode_cachep = kmem_cache_create("gfs2_inode", 105 gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
82 sizeof(struct gfs2_inode), 106 sizeof(struct gfs2_inode),
83 0, SLAB_RECLAIM_ACCOUNT| 107 0, SLAB_RECLAIM_ACCOUNT|
@@ -144,6 +168,9 @@ fail:
144 if (gfs2_inode_cachep) 168 if (gfs2_inode_cachep)
145 kmem_cache_destroy(gfs2_inode_cachep); 169 kmem_cache_destroy(gfs2_inode_cachep);
146 170
171 if (gfs2_glock_aspace_cachep)
172 kmem_cache_destroy(gfs2_glock_aspace_cachep);
173
147 if (gfs2_glock_cachep) 174 if (gfs2_glock_cachep)
148 kmem_cache_destroy(gfs2_glock_cachep); 175 kmem_cache_destroy(gfs2_glock_cachep);
149 176
@@ -169,6 +196,7 @@ static void __exit exit_gfs2_fs(void)
169 kmem_cache_destroy(gfs2_rgrpd_cachep); 196 kmem_cache_destroy(gfs2_rgrpd_cachep);
170 kmem_cache_destroy(gfs2_bufdata_cachep); 197 kmem_cache_destroy(gfs2_bufdata_cachep);
171 kmem_cache_destroy(gfs2_inode_cachep); 198 kmem_cache_destroy(gfs2_inode_cachep);
199 kmem_cache_destroy(gfs2_glock_aspace_cachep);
172 kmem_cache_destroy(gfs2_glock_cachep); 200 kmem_cache_destroy(gfs2_glock_cachep);
173 201
174 gfs2_sys_uninit(); 202 gfs2_sys_uninit();
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index cb8d7a93d5ec..0bb12c80937a 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -93,49 +93,13 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
93 return err; 93 return err;
94} 94}
95 95
96static const struct address_space_operations aspace_aops = { 96const struct address_space_operations gfs2_meta_aops = {
97 .writepage = gfs2_aspace_writepage, 97 .writepage = gfs2_aspace_writepage,
98 .releasepage = gfs2_releasepage, 98 .releasepage = gfs2_releasepage,
99 .sync_page = block_sync_page, 99 .sync_page = block_sync_page,
100}; 100};
101 101
102/** 102/**
103 * gfs2_aspace_get - Create and initialize a struct inode structure
104 * @sdp: the filesystem the aspace is in
105 *
106 * Right now a struct inode is just a struct inode. Maybe Linux
107 * will supply a more lightweight address space construct (that works)
108 * in the future.
109 *
110 * Make sure pages/buffers in this aspace aren't in high memory.
111 *
112 * Returns: the aspace
113 */
114
115struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp)
116{
117 struct inode *aspace;
118 struct gfs2_inode *ip;
119
120 aspace = new_inode(sdp->sd_vfs);
121 if (aspace) {
122 mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS);
123 aspace->i_mapping->a_ops = &aspace_aops;
124 aspace->i_size = ~0ULL;
125 ip = GFS2_I(aspace);
126 clear_bit(GIF_USER, &ip->i_flags);
127 insert_inode_hash(aspace);
128 }
129 return aspace;
130}
131
132void gfs2_aspace_put(struct inode *aspace)
133{
134 remove_inode_hash(aspace);
135 iput(aspace);
136}
137
138/**
139 * gfs2_meta_sync - Sync all buffers associated with a glock 103 * gfs2_meta_sync - Sync all buffers associated with a glock
140 * @gl: The glock 104 * @gl: The glock
141 * 105 *
@@ -143,7 +107,7 @@ void gfs2_aspace_put(struct inode *aspace)
143 107
144void gfs2_meta_sync(struct gfs2_glock *gl) 108void gfs2_meta_sync(struct gfs2_glock *gl)
145{ 109{
146 struct address_space *mapping = gl->gl_aspace->i_mapping; 110 struct address_space *mapping = gfs2_glock2aspace(gl);
147 int error; 111 int error;
148 112
149 filemap_fdatawrite(mapping); 113 filemap_fdatawrite(mapping);
@@ -164,7 +128,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
164 128
165struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) 129struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
166{ 130{
167 struct address_space *mapping = gl->gl_aspace->i_mapping; 131 struct address_space *mapping = gfs2_glock2aspace(gl);
168 struct gfs2_sbd *sdp = gl->gl_sbd; 132 struct gfs2_sbd *sdp = gl->gl_sbd;
169 struct page *page; 133 struct page *page;
170 struct buffer_head *bh; 134 struct buffer_head *bh;
@@ -344,8 +308,10 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
344 308
345void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta) 309void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta)
346{ 310{
347 struct gfs2_sbd *sdp = GFS2_SB(bh->b_page->mapping->host); 311 struct address_space *mapping = bh->b_page->mapping;
312 struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
348 struct gfs2_bufdata *bd = bh->b_private; 313 struct gfs2_bufdata *bd = bh->b_private;
314
349 if (test_clear_buffer_pinned(bh)) { 315 if (test_clear_buffer_pinned(bh)) {
350 list_del_init(&bd->bd_le.le_list); 316 list_del_init(&bd->bd_le.le_list);
351 if (meta) { 317 if (meta) {
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index de270c2f9b63..6a1d9ba16411 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -37,8 +37,16 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
37 0, from_head - to_head); 37 0, from_head - to_head);
38} 38}
39 39
40struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp); 40extern const struct address_space_operations gfs2_meta_aops;
41void gfs2_aspace_put(struct inode *aspace); 41
42static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
43{
44 struct inode *inode = mapping->host;
45 if (mapping->a_ops == &gfs2_meta_aops)
46 return (((struct gfs2_glock *)mapping) - 1)->gl_sbd;
47 else
48 return inode->i_sb->s_fs_info;
49}
42 50
43void gfs2_meta_sync(struct gfs2_glock *gl); 51void gfs2_meta_sync(struct gfs2_glock *gl);
44 52
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index edfee24f3636..c1309ed1c496 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -65,7 +65,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
65 gt->gt_quota_scale_den = 1; 65 gt->gt_quota_scale_den = 1;
66 gt->gt_new_files_jdata = 0; 66 gt->gt_new_files_jdata = 0;
67 gt->gt_max_readahead = 1 << 18; 67 gt->gt_max_readahead = 1 << 18;
68 gt->gt_stall_secs = 600;
69 gt->gt_complain_secs = 10; 68 gt->gt_complain_secs = 10;
70} 69}
71 70
@@ -82,6 +81,8 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
82 81
83 gfs2_tune_init(&sdp->sd_tune); 82 gfs2_tune_init(&sdp->sd_tune);
84 83
84 init_waitqueue_head(&sdp->sd_glock_wait);
85 atomic_set(&sdp->sd_glock_disposal, 0);
85 spin_lock_init(&sdp->sd_statfs_spin); 86 spin_lock_init(&sdp->sd_statfs_spin);
86 87
87 spin_lock_init(&sdp->sd_rindex_spin); 88 spin_lock_init(&sdp->sd_rindex_spin);
@@ -723,7 +724,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
723 goto fail; 724 goto fail;
724 } 725 }
725 726
726 error = -EINVAL; 727 error = -EUSERS;
727 if (!gfs2_jindex_size(sdp)) { 728 if (!gfs2_jindex_size(sdp)) {
728 fs_err(sdp, "no journals!\n"); 729 fs_err(sdp, "no journals!\n");
729 goto fail_jindex; 730 goto fail_jindex;
@@ -983,16 +984,24 @@ static const match_table_t nolock_tokens = {
983 { Opt_err, NULL }, 984 { Opt_err, NULL },
984}; 985};
985 986
987static void nolock_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
988{
989 struct gfs2_sbd *sdp = gl->gl_sbd;
990 kmem_cache_free(cachep, gl);
991 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
992 wake_up(&sdp->sd_glock_wait);
993}
994
986static const struct lm_lockops nolock_ops = { 995static const struct lm_lockops nolock_ops = {
987 .lm_proto_name = "lock_nolock", 996 .lm_proto_name = "lock_nolock",
988 .lm_put_lock = kmem_cache_free, 997 .lm_put_lock = nolock_put_lock,
989 .lm_tokens = &nolock_tokens, 998 .lm_tokens = &nolock_tokens,
990}; 999};
991 1000
992/** 1001/**
993 * gfs2_lm_mount - mount a locking protocol 1002 * gfs2_lm_mount - mount a locking protocol
994 * @sdp: the filesystem 1003 * @sdp: the filesystem
995 * @args: mount arguements 1004 * @args: mount arguments
996 * @silent: if 1, don't complain if the FS isn't a GFS2 fs 1005 * @silent: if 1, don't complain if the FS isn't a GFS2 fs
997 * 1006 *
998 * Returns: errno 1007 * Returns: errno
@@ -1231,10 +1240,9 @@ fail_sb:
1231fail_locking: 1240fail_locking:
1232 init_locking(sdp, &mount_gh, UNDO); 1241 init_locking(sdp, &mount_gh, UNDO);
1233fail_lm: 1242fail_lm:
1243 invalidate_inodes(sb);
1234 gfs2_gl_hash_clear(sdp); 1244 gfs2_gl_hash_clear(sdp);
1235 gfs2_lm_unmount(sdp); 1245 gfs2_lm_unmount(sdp);
1236 while (invalidate_inodes(sb))
1237 yield();
1238fail_sys: 1246fail_sys:
1239 gfs2_sys_fs_del(sdp); 1247 gfs2_sys_fs_del(sdp);
1240fail: 1248fail:
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 247436c10deb..4e64352d49de 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -748,7 +748,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
748 struct gfs2_rgrpd *nrgd; 748 struct gfs2_rgrpd *nrgd;
749 unsigned int num_gh; 749 unsigned int num_gh;
750 int dir_rename = 0; 750 int dir_rename = 0;
751 int alloc_required; 751 int alloc_required = 0;
752 unsigned int x; 752 unsigned int x;
753 int error; 753 int error;
754 754
@@ -867,7 +867,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
867 goto out_gunlock; 867 goto out_gunlock;
868 } 868 }
869 869
870 alloc_required = error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); 870 if (nip == NULL)
871 alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name);
872 error = alloc_required;
871 if (error < 0) 873 if (error < 0)
872 goto out_gunlock; 874 goto out_gunlock;
873 error = 0; 875 error = 0;
@@ -974,121 +976,62 @@ out:
974} 976}
975 977
976/** 978/**
977 * gfs2_readlinki - return the contents of a symlink 979 * gfs2_follow_link - Follow a symbolic link
978 * @ip: the symlink's inode 980 * @dentry: The dentry of the link
979 * @buf: a pointer to the buffer to be filled 981 * @nd: Data that we pass to vfs_follow_link()
980 * @len: a pointer to the length of @buf
981 * 982 *
982 * If @buf is too small, a piece of memory is kmalloc()ed and needs 983 * This can handle symlinks of any size.
983 * to be freed by the caller.
984 * 984 *
985 * Returns: errno 985 * Returns: 0 on success or error code
986 */ 986 */
987 987
988static int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) 988static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
989{ 989{
990 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
990 struct gfs2_holder i_gh; 991 struct gfs2_holder i_gh;
991 struct buffer_head *dibh; 992 struct buffer_head *dibh;
992 unsigned int x; 993 unsigned int x;
994 char *buf;
993 int error; 995 int error;
994 996
995 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); 997 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
996 error = gfs2_glock_nq(&i_gh); 998 error = gfs2_glock_nq(&i_gh);
997 if (error) { 999 if (error) {
998 gfs2_holder_uninit(&i_gh); 1000 gfs2_holder_uninit(&i_gh);
999 return error; 1001 nd_set_link(nd, ERR_PTR(error));
1002 return NULL;
1000 } 1003 }
1001 1004
1002 if (!ip->i_disksize) { 1005 if (!ip->i_disksize) {
1003 gfs2_consist_inode(ip); 1006 gfs2_consist_inode(ip);
1004 error = -EIO; 1007 buf = ERR_PTR(-EIO);
1005 goto out; 1008 goto out;
1006 } 1009 }
1007 1010
1008 error = gfs2_meta_inode_buffer(ip, &dibh); 1011 error = gfs2_meta_inode_buffer(ip, &dibh);
1009 if (error) 1012 if (error) {
1013 buf = ERR_PTR(error);
1010 goto out; 1014 goto out;
1011
1012 x = ip->i_disksize + 1;
1013 if (x > *len) {
1014 *buf = kmalloc(x, GFP_NOFS);
1015 if (!*buf) {
1016 error = -ENOMEM;
1017 goto out_brelse;
1018 }
1019 } 1015 }
1020 1016
1021 memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x); 1017 x = ip->i_disksize + 1;
1022 *len = x; 1018 buf = kmalloc(x, GFP_NOFS);
1023 1019 if (!buf)
1024out_brelse: 1020 buf = ERR_PTR(-ENOMEM);
1021 else
1022 memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
1025 brelse(dibh); 1023 brelse(dibh);
1026out: 1024out:
1027 gfs2_glock_dq_uninit(&i_gh); 1025 gfs2_glock_dq_uninit(&i_gh);
1028 return error; 1026 nd_set_link(nd, buf);
1029} 1027 return NULL;
1030
1031/**
1032 * gfs2_readlink - Read the value of a symlink
1033 * @dentry: the symlink
1034 * @buf: the buffer to read the symlink data into
1035 * @size: the size of the buffer
1036 *
1037 * Returns: errno
1038 */
1039
1040static int gfs2_readlink(struct dentry *dentry, char __user *user_buf,
1041 int user_size)
1042{
1043 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
1044 char array[GFS2_FAST_NAME_SIZE], *buf = array;
1045 unsigned int len = GFS2_FAST_NAME_SIZE;
1046 int error;
1047
1048 error = gfs2_readlinki(ip, &buf, &len);
1049 if (error)
1050 return error;
1051
1052 if (user_size > len - 1)
1053 user_size = len - 1;
1054
1055 if (copy_to_user(user_buf, buf, user_size))
1056 error = -EFAULT;
1057 else
1058 error = user_size;
1059
1060 if (buf != array)
1061 kfree(buf);
1062
1063 return error;
1064} 1028}
1065 1029
1066/** 1030static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
1067 * gfs2_follow_link - Follow a symbolic link
1068 * @dentry: The dentry of the link
1069 * @nd: Data that we pass to vfs_follow_link()
1070 *
1071 * This can handle symlinks of any size. It is optimised for symlinks
1072 * under GFS2_FAST_NAME_SIZE.
1073 *
1074 * Returns: 0 on success or error code
1075 */
1076
1077static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
1078{ 1031{
1079 struct gfs2_inode *ip = GFS2_I(dentry->d_inode); 1032 char *s = nd_get_link(nd);
1080 char array[GFS2_FAST_NAME_SIZE], *buf = array; 1033 if (!IS_ERR(s))
1081 unsigned int len = GFS2_FAST_NAME_SIZE; 1034 kfree(s);
1082 int error;
1083
1084 error = gfs2_readlinki(ip, &buf, &len);
1085 if (!error) {
1086 error = vfs_follow_link(nd, buf);
1087 if (buf != array)
1088 kfree(buf);
1089 }
1090
1091 return ERR_PTR(error);
1092} 1035}
1093 1036
1094/** 1037/**
@@ -1423,8 +1366,9 @@ const struct inode_operations gfs2_dir_iops = {
1423}; 1366};
1424 1367
1425const struct inode_operations gfs2_symlink_iops = { 1368const struct inode_operations gfs2_symlink_iops = {
1426 .readlink = gfs2_readlink, 1369 .readlink = generic_readlink,
1427 .follow_link = gfs2_follow_link, 1370 .follow_link = gfs2_follow_link,
1371 .put_link = gfs2_put_link,
1428 .permission = gfs2_permission, 1372 .permission = gfs2_permission,
1429 .setattr = gfs2_setattr, 1373 .setattr = gfs2_setattr,
1430 .getattr = gfs2_getattr, 1374 .getattr = gfs2_getattr,
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index e3bf6eab8750..6dbcbad6ab17 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1083,7 +1083,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
1083 } 1083 }
1084} 1084}
1085 1085
1086int gfs2_quota_sync(struct super_block *sb, int type) 1086int gfs2_quota_sync(struct super_block *sb, int type, int wait)
1087{ 1087{
1088 struct gfs2_sbd *sdp = sb->s_fs_info; 1088 struct gfs2_sbd *sdp = sb->s_fs_info;
1089 struct gfs2_quota_data **qda; 1089 struct gfs2_quota_data **qda;
@@ -1127,6 +1127,11 @@ int gfs2_quota_sync(struct super_block *sb, int type)
1127 return error; 1127 return error;
1128} 1128}
1129 1129
1130static int gfs2_quota_sync_timeo(struct super_block *sb, int type)
1131{
1132 return gfs2_quota_sync(sb, type, 0);
1133}
1134
1130int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id) 1135int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
1131{ 1136{
1132 struct gfs2_quota_data *qd; 1137 struct gfs2_quota_data *qd;
@@ -1382,7 +1387,7 @@ int gfs2_quotad(void *data)
1382 &tune->gt_statfs_quantum); 1387 &tune->gt_statfs_quantum);
1383 1388
1384 /* Update quota file */ 1389 /* Update quota file */
1385 quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, 1390 quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t,
1386 &quotad_timeo, &tune->gt_quota_quantum); 1391 &quotad_timeo, &tune->gt_quota_quantum);
1387 1392
1388 /* Check for & recover partially truncated inodes */ 1393 /* Check for & recover partially truncated inodes */
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index e271fa07ad02..195f60c8bd14 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -25,7 +25,7 @@ extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
25extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, 25extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
26 u32 uid, u32 gid); 26 u32 uid, u32 gid);
27 27
28extern int gfs2_quota_sync(struct super_block *sb, int type); 28extern int gfs2_quota_sync(struct super_block *sb, int type, int wait);
29extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); 29extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
30 30
31extern int gfs2_quota_init(struct gfs2_sbd *sdp); 31extern int gfs2_quota_init(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 0608f490c295..503b842f3ba2 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -591,11 +591,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
591 u64 rgrp_count = ip->i_disksize; 591 u64 rgrp_count = ip->i_disksize;
592 int error; 592 int error;
593 593
594 if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) { 594 do_div(rgrp_count, sizeof(struct gfs2_rindex));
595 gfs2_consist_inode(ip);
596 return -EIO;
597 }
598
599 clear_rgrpdi(sdp); 595 clear_rgrpdi(sdp);
600 596
601 file_ra_state_init(&ra_state, inode->i_mapping); 597 file_ra_state_init(&ra_state, inode->i_mapping);
@@ -915,7 +911,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
915struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) 911struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
916{ 912{
917 BUG_ON(ip->i_alloc != NULL); 913 BUG_ON(ip->i_alloc != NULL);
918 ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_KERNEL); 914 ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_NOFS);
919 return ip->i_alloc; 915 return ip->i_alloc;
920} 916}
921 917
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index b4106ddaaa98..f07119d89557 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -10,6 +10,8 @@
10#ifndef __RGRP_DOT_H__ 10#ifndef __RGRP_DOT_H__
11#define __RGRP_DOT_H__ 11#define __RGRP_DOT_H__
12 12
13#include <linux/slab.h>
14
13struct gfs2_rgrpd; 15struct gfs2_rgrpd;
14struct gfs2_sbd; 16struct gfs2_sbd;
15struct gfs2_holder; 17struct gfs2_holder;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index c282ad41f3d1..50aac606b990 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -21,6 +21,8 @@
21#include <linux/gfs2_ondisk.h> 21#include <linux/gfs2_ondisk.h>
22#include <linux/crc32.h> 22#include <linux/crc32.h>
23#include <linux/time.h> 23#include <linux/time.h>
24#include <linux/wait.h>
25#include <linux/writeback.h>
24 26
25#include "gfs2.h" 27#include "gfs2.h"
26#include "incore.h" 28#include "incore.h"
@@ -710,7 +712,7 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
710 * Returns: errno 712 * Returns: errno
711 */ 713 */
712 714
713static int gfs2_write_inode(struct inode *inode, int sync) 715static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
714{ 716{
715 struct gfs2_inode *ip = GFS2_I(inode); 717 struct gfs2_inode *ip = GFS2_I(inode);
716 struct gfs2_sbd *sdp = GFS2_SB(inode); 718 struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -721,8 +723,7 @@ static int gfs2_write_inode(struct inode *inode, int sync)
721 int ret = 0; 723 int ret = 0;
722 724
723 /* Check this is a "normal" inode, etc */ 725 /* Check this is a "normal" inode, etc */
724 if (!test_bit(GIF_USER, &ip->i_flags) || 726 if (current->flags & PF_MEMALLOC)
725 (current->flags & PF_MEMALLOC))
726 return 0; 727 return 0;
727 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 728 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
728 if (ret) 729 if (ret)
@@ -745,7 +746,7 @@ static int gfs2_write_inode(struct inode *inode, int sync)
745do_unlock: 746do_unlock:
746 gfs2_glock_dq_uninit(&gh); 747 gfs2_glock_dq_uninit(&gh);
747do_flush: 748do_flush:
748 if (sync != 0) 749 if (wbc->sync_mode == WB_SYNC_ALL)
749 gfs2_log_flush(GFS2_SB(inode), ip->i_gl); 750 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
750 return ret; 751 return ret;
751} 752}
@@ -763,7 +764,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
763 int error; 764 int error;
764 765
765 flush_workqueue(gfs2_delete_workqueue); 766 flush_workqueue(gfs2_delete_workqueue);
766 gfs2_quota_sync(sdp->sd_vfs, 0); 767 gfs2_quota_sync(sdp->sd_vfs, 0, 1);
767 gfs2_statfs_sync(sdp->sd_vfs, 0); 768 gfs2_statfs_sync(sdp->sd_vfs, 0);
768 769
769 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, 770 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
@@ -859,6 +860,7 @@ restart:
859 gfs2_clear_rgrpd(sdp); 860 gfs2_clear_rgrpd(sdp);
860 gfs2_jindex_free(sdp); 861 gfs2_jindex_free(sdp);
861 /* Take apart glock structures and buffer lists */ 862 /* Take apart glock structures and buffer lists */
863 invalidate_inodes(sdp->sd_vfs);
862 gfs2_gl_hash_clear(sdp); 864 gfs2_gl_hash_clear(sdp);
863 /* Unmount the locking protocol */ 865 /* Unmount the locking protocol */
864 gfs2_lm_unmount(sdp); 866 gfs2_lm_unmount(sdp);
@@ -1193,7 +1195,7 @@ static void gfs2_drop_inode(struct inode *inode)
1193{ 1195{
1194 struct gfs2_inode *ip = GFS2_I(inode); 1196 struct gfs2_inode *ip = GFS2_I(inode);
1195 1197
1196 if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) { 1198 if (inode->i_nlink) {
1197 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; 1199 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
1198 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) 1200 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
1199 clear_nlink(inode); 1201 clear_nlink(inode);
@@ -1211,18 +1213,12 @@ static void gfs2_clear_inode(struct inode *inode)
1211{ 1213{
1212 struct gfs2_inode *ip = GFS2_I(inode); 1214 struct gfs2_inode *ip = GFS2_I(inode);
1213 1215
1214 /* This tells us its a "real" inode and not one which only 1216 ip->i_gl->gl_object = NULL;
1215 * serves to contain an address space (see rgrp.c, meta_io.c) 1217 gfs2_glock_put(ip->i_gl);
1216 * which therefore doesn't have its own glocks. 1218 ip->i_gl = NULL;
1217 */ 1219 if (ip->i_iopen_gh.gh_gl) {
1218 if (test_bit(GIF_USER, &ip->i_flags)) { 1220 ip->i_iopen_gh.gh_gl->gl_object = NULL;
1219 ip->i_gl->gl_object = NULL; 1221 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
1220 gfs2_glock_put(ip->i_gl);
1221 ip->i_gl = NULL;
1222 if (ip->i_iopen_gh.gh_gl) {
1223 ip->i_iopen_gh.gh_gl->gl_object = NULL;
1224 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
1225 }
1226 } 1222 }
1227} 1223}
1228 1224
@@ -1357,9 +1353,6 @@ static void gfs2_delete_inode(struct inode *inode)
1357 struct gfs2_holder gh; 1353 struct gfs2_holder gh;
1358 int error; 1354 int error;
1359 1355
1360 if (!test_bit(GIF_USER, &ip->i_flags))
1361 goto out;
1362
1363 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 1356 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
1364 if (unlikely(error)) { 1357 if (unlikely(error)) {
1365 gfs2_glock_dq_uninit(&ip->i_iopen_gh); 1358 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 0dc34621f6a6..54fd98425991 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -8,7 +8,6 @@
8 */ 8 */
9 9
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h> 11#include <linux/spinlock.h>
13#include <linux/completion.h> 12#include <linux/completion.h>
14#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
@@ -49,7 +48,7 @@ static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
49 return a->store ? a->store(sdp, buf, len) : len; 48 return a->store ? a->store(sdp, buf, len) : len;
50} 49}
51 50
52static struct sysfs_ops gfs2_attr_ops = { 51static const struct sysfs_ops gfs2_attr_ops = {
53 .show = gfs2_attr_show, 52 .show = gfs2_attr_show,
54 .store = gfs2_attr_store, 53 .store = gfs2_attr_store,
55}; 54};
@@ -167,7 +166,7 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
167 if (simple_strtol(buf, NULL, 0) != 1) 166 if (simple_strtol(buf, NULL, 0) != 1)
168 return -EINVAL; 167 return -EINVAL;
169 168
170 gfs2_quota_sync(sdp->sd_vfs, 0); 169 gfs2_quota_sync(sdp->sd_vfs, 0, 1);
171 return len; 170 return len;
172} 171}
173 172
@@ -478,7 +477,6 @@ TUNE_ATTR(complain_secs, 0);
478TUNE_ATTR(statfs_slow, 0); 477TUNE_ATTR(statfs_slow, 0);
479TUNE_ATTR(new_files_jdata, 0); 478TUNE_ATTR(new_files_jdata, 0);
480TUNE_ATTR(quota_simul_sync, 1); 479TUNE_ATTR(quota_simul_sync, 1);
481TUNE_ATTR(stall_secs, 1);
482TUNE_ATTR(statfs_quantum, 1); 480TUNE_ATTR(statfs_quantum, 1);
483TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); 481TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
484 482
@@ -491,7 +489,6 @@ static struct attribute *tune_attrs[] = {
491 &tune_attr_complain_secs.attr, 489 &tune_attr_complain_secs.attr,
492 &tune_attr_statfs_slow.attr, 490 &tune_attr_statfs_slow.attr,
493 &tune_attr_quota_simul_sync.attr, 491 &tune_attr_quota_simul_sync.attr,
494 &tune_attr_stall_secs.attr,
495 &tune_attr_statfs_quantum.attr, 492 &tune_attr_statfs_quantum.attr,
496 &tune_attr_quota_scale.attr, 493 &tune_attr_quota_scale.attr,
497 &tune_attr_new_files_jdata.attr, 494 &tune_attr_new_files_jdata.attr,
@@ -576,7 +573,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
576 return 0; 573 return 0;
577} 574}
578 575
579static struct kset_uevent_ops gfs2_uevent_ops = { 576static const struct kset_uevent_ops gfs2_uevent_ops = {
580 .uevent = gfs2_uevent, 577 .uevent = gfs2_uevent,
581}; 578};
582 579
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index f6a7efa34eb9..53511291fe36 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -7,7 +7,6 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/slab.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/completion.h> 11#include <linux/completion.h>
13#include <linux/buffer_head.h> 12#include <linux/buffer_head.h>
@@ -21,6 +20,7 @@
21#include "util.h" 20#include "util.h"
22 21
23struct kmem_cache *gfs2_glock_cachep __read_mostly; 22struct kmem_cache *gfs2_glock_cachep __read_mostly;
23struct kmem_cache *gfs2_glock_aspace_cachep __read_mostly;
24struct kmem_cache *gfs2_inode_cachep __read_mostly; 24struct kmem_cache *gfs2_inode_cachep __read_mostly;
25struct kmem_cache *gfs2_bufdata_cachep __read_mostly; 25struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
26struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; 26struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 33e96b0ce9ab..b432e04600de 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -145,6 +145,7 @@ gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__);
145 145
146 146
147extern struct kmem_cache *gfs2_glock_cachep; 147extern struct kmem_cache *gfs2_glock_cachep;
148extern struct kmem_cache *gfs2_glock_aspace_cachep;
148extern struct kmem_cache *gfs2_inode_cachep; 149extern struct kmem_cache *gfs2_inode_cachep;
149extern struct kmem_cache *gfs2_bufdata_cachep; 150extern struct kmem_cache *gfs2_bufdata_cachep;
150extern struct kmem_cache *gfs2_rgrpd_cachep; 151extern struct kmem_cache *gfs2_rgrpd_cachep;
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 8a04108e0c22..c2ebdf2c01d4 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1296,6 +1296,7 @@ fail:
1296 1296
1297int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) 1297int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
1298{ 1298{
1299 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1299 struct gfs2_ea_location el; 1300 struct gfs2_ea_location el;
1300 struct buffer_head *dibh; 1301 struct buffer_head *dibh;
1301 int error; 1302 int error;
@@ -1305,16 +1306,17 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
1305 return error; 1306 return error;
1306 1307
1307 if (GFS2_EA_IS_STUFFED(el.el_ea)) { 1308 if (GFS2_EA_IS_STUFFED(el.el_ea)) {
1308 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0); 1309 error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0);
1309 if (error) 1310 if (error == 0) {
1310 return error; 1311 gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1);
1311 1312 memcpy(GFS2_EA2DATA(el.el_ea), data,
1312 gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1); 1313 GFS2_EA_DATA_LEN(el.el_ea));
1313 memcpy(GFS2_EA2DATA(el.el_ea), data, 1314 }
1314 GFS2_EA_DATA_LEN(el.el_ea)); 1315 } else {
1315 } else
1316 error = ea_acl_chmod_unstuffed(ip, el.el_ea, data); 1316 error = ea_acl_chmod_unstuffed(ip, el.el_ea, data);
1317 }
1317 1318
1319 brelse(el.el_bh);
1318 if (error) 1320 if (error)
1319 return error; 1321 return error;
1320 1322
@@ -1327,8 +1329,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
1327 brelse(dibh); 1329 brelse(dibh);
1328 } 1330 }
1329 1331
1330 gfs2_trans_end(GFS2_SB(&ip->i_inode)); 1332 gfs2_trans_end(sdp);
1331
1332 return error; 1333 return error;
1333} 1334}
1334 1335
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 0d200068d0af..cdb41a1f6a64 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/slab.h>
12#include <linux/swap.h> 13#include <linux/swap.h>
13 14
14#include "btree.h" 15#include "btree.h"
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 052f214ea6f0..38a0a9917d7f 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/slab.h>
12#include <linux/log2.h> 13#include <linux/log2.h>
13 14
14#include "btree.h" 15#include "btree.h"
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 052387e11671..fe35e3b626c4 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -188,7 +188,7 @@ extern const struct address_space_operations hfs_btree_aops;
188 188
189extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int); 189extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int);
190extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); 190extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *);
191extern int hfs_write_inode(struct inode *, int); 191extern int hfs_write_inode(struct inode *, struct writeback_control *);
192extern int hfs_inode_setattr(struct dentry *, struct iattr *); 192extern int hfs_inode_setattr(struct dentry *, struct iattr *);
193extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, 193extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext,
194 __be32 log_size, __be32 phys_size, u32 clump_size); 194 __be32 log_size, __be32 phys_size, u32 clump_size);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index a1cbff2b4d99..14f5cb1b9fdc 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -381,7 +381,7 @@ void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext,
381 HFS_SB(inode->i_sb)->alloc_blksz); 381 HFS_SB(inode->i_sb)->alloc_blksz);
382} 382}
383 383
384int hfs_write_inode(struct inode *inode, int unused) 384int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
385{ 385{
386 struct inode *main_inode = inode; 386 struct inode *main_inode = inode;
387 struct hfs_find_data fd; 387 struct hfs_find_data fd;
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 8bbe03c3f6d5..86428f5ac991 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -11,6 +11,7 @@
11#include <linux/cdrom.h> 11#include <linux/cdrom.h>
12#include <linux/genhd.h> 12#include <linux/genhd.h>
13#include <linux/nls.h> 13#include <linux/nls.h>
14#include <linux/slab.h>
14 15
15#include "hfs_fs.h" 16#include "hfs_fs.h"
16#include "btree.h" 17#include "btree.h"
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 5ed7252b7b23..0a81eb7111f3 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -19,6 +19,7 @@
19#include <linux/nls.h> 19#include <linux/nls.h>
20#include <linux/parser.h> 20#include <linux/parser.h>
21#include <linux/seq_file.h> 21#include <linux/seq_file.h>
22#include <linux/slab.h>
22#include <linux/smp_lock.h> 23#include <linux/smp_lock.h>
23#include <linux/vfs.h> 24#include <linux/vfs.h>
24 25
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 3fcbb0e1f6fc..572628b4b07d 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -15,6 +15,7 @@
15#include <linux/nls.h> 15#include <linux/nls.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/seq_file.h> 17#include <linux/seq_file.h>
18#include <linux/slab.h>
18#include "hfsplus_fs.h" 19#include "hfsplus_fs.h"
19 20
20enum { 21enum {
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 43022f3d5148..74b473a8ef92 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -87,7 +87,8 @@ bad_inode:
87 return ERR_PTR(err); 87 return ERR_PTR(err);
88} 88}
89 89
90static int hfsplus_write_inode(struct inode *inode, int unused) 90static int hfsplus_write_inode(struct inode *inode,
91 struct writeback_control *wbc)
91{ 92{
92 struct hfsplus_vh *vhdr; 93 struct hfsplus_vh *vhdr;
93 int ret = 0; 94 int ret = 0;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 032604e5ef2c..3a029d8f4cf1 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -11,6 +11,7 @@
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/statfs.h> 13#include <linux/statfs.h>
14#include <linux/slab.h>
14#include <linux/seq_file.h> 15#include <linux/seq_file.h>
15#include <linux/mount.h> 16#include <linux/mount.h>
16#include "hostfs.h" 17#include "hostfs.h"
diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c
index 1aa88c4e0964..6a2f04bf3df0 100644
--- a/fs/hpfs/anode.c
+++ b/fs/hpfs/anode.c
@@ -353,7 +353,7 @@ int hpfs_ea_read(struct super_block *s, secno a, int ano, unsigned pos,
353} 353}
354 354
355int hpfs_ea_write(struct super_block *s, secno a, int ano, unsigned pos, 355int hpfs_ea_write(struct super_block *s, secno a, int ano, unsigned pos,
356 unsigned len, char *buf) 356 unsigned len, const char *buf)
357{ 357{
358 struct buffer_head *bh; 358 struct buffer_head *bh;
359 char *data; 359 char *data;
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index b6fca543544c..eac5f96323e3 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -6,6 +6,7 @@
6 * general buffer i/o 6 * general buffer i/o
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/slab.h>
9#include "hpfs_fn.h" 10#include "hpfs_fn.h"
10 11
11void hpfs_lock_creation(struct super_block *s) 12void hpfs_lock_creation(struct super_block *s)
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 940d6d150bee..67d9d36b3d5f 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -20,8 +20,8 @@ static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr)
20 20
21 if (l == 1) if (qstr->name[0]=='.') goto x; 21 if (l == 1) if (qstr->name[0]=='.') goto x;
22 if (l == 2) if (qstr->name[0]=='.' || qstr->name[1]=='.') goto x; 22 if (l == 2) if (qstr->name[0]=='.' || qstr->name[1]=='.') goto x;
23 hpfs_adjust_length((char *)qstr->name, &l); 23 hpfs_adjust_length(qstr->name, &l);
24 /*if (hpfs_chk_name((char *)qstr->name,&l))*/ 24 /*if (hpfs_chk_name(qstr->name,&l))*/
25 /*return -ENAMETOOLONG;*/ 25 /*return -ENAMETOOLONG;*/
26 /*return -ENOENT;*/ 26 /*return -ENOENT;*/
27 x: 27 x:
@@ -38,14 +38,16 @@ static int hpfs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qst
38{ 38{
39 unsigned al=a->len; 39 unsigned al=a->len;
40 unsigned bl=b->len; 40 unsigned bl=b->len;
41 hpfs_adjust_length((char *)a->name, &al); 41 hpfs_adjust_length(a->name, &al);
42 /*hpfs_adjust_length((char *)b->name, &bl);*/ 42 /*hpfs_adjust_length(b->name, &bl);*/
43 /* 'a' is the qstr of an already existing dentry, so the name 43 /* 'a' is the qstr of an already existing dentry, so the name
44 * must be valid. 'b' must be validated first. 44 * must be valid. 'b' must be validated first.
45 */ 45 */
46 46
47 if (hpfs_chk_name((char *)b->name, &bl)) return 1; 47 if (hpfs_chk_name(b->name, &bl))
48 if (hpfs_compare_names(dentry->d_sb, (char *)a->name, al, (char *)b->name, bl, 0)) return 1; 48 return 1;
49 if (hpfs_compare_names(dentry->d_sb, a->name, al, b->name, bl, 0))
50 return 1;
49 return 0; 51 return 0;
50} 52}
51 53
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 8865c94f55f6..2338130cceba 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/smp_lock.h> 9#include <linux/smp_lock.h>
10#include <linux/slab.h>
10#include "hpfs_fn.h" 11#include "hpfs_fn.h"
11 12
12static int hpfs_dir_release(struct inode *inode, struct file *filp) 13static int hpfs_dir_release(struct inode *inode, struct file *filp)
@@ -59,7 +60,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
59 struct hpfs_dirent *de; 60 struct hpfs_dirent *de;
60 int lc; 61 int lc;
61 long old_pos; 62 long old_pos;
62 char *tempname; 63 unsigned char *tempname;
63 int c1, c2 = 0; 64 int c1, c2 = 0;
64 int ret = 0; 65 int ret = 0;
65 66
@@ -158,11 +159,11 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
158 tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3); 159 tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3);
159 if (filldir(dirent, tempname, de->namelen, old_pos, de->fnode, DT_UNKNOWN) < 0) { 160 if (filldir(dirent, tempname, de->namelen, old_pos, de->fnode, DT_UNKNOWN) < 0) {
160 filp->f_pos = old_pos; 161 filp->f_pos = old_pos;
161 if (tempname != (char *)de->name) kfree(tempname); 162 if (tempname != de->name) kfree(tempname);
162 hpfs_brelse4(&qbh); 163 hpfs_brelse4(&qbh);
163 goto out; 164 goto out;
164 } 165 }
165 if (tempname != (char *)de->name) kfree(tempname); 166 if (tempname != de->name) kfree(tempname);
166 hpfs_brelse4(&qbh); 167 hpfs_brelse4(&qbh);
167 } 168 }
168out: 169out:
@@ -187,7 +188,7 @@ out:
187 188
188struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 189struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
189{ 190{
190 const char *name = dentry->d_name.name; 191 const unsigned char *name = dentry->d_name.name;
191 unsigned len = dentry->d_name.len; 192 unsigned len = dentry->d_name.len;
192 struct quad_buffer_head qbh; 193 struct quad_buffer_head qbh;
193 struct hpfs_dirent *de; 194 struct hpfs_dirent *de;
@@ -197,7 +198,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
197 struct hpfs_inode_info *hpfs_result; 198 struct hpfs_inode_info *hpfs_result;
198 199
199 lock_kernel(); 200 lock_kernel();
200 if ((err = hpfs_chk_name((char *)name, &len))) { 201 if ((err = hpfs_chk_name(name, &len))) {
201 if (err == -ENAMETOOLONG) { 202 if (err == -ENAMETOOLONG) {
202 unlock_kernel(); 203 unlock_kernel();
203 return ERR_PTR(-ENAMETOOLONG); 204 return ERR_PTR(-ENAMETOOLONG);
@@ -209,7 +210,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
209 * '.' and '..' will never be passed here. 210 * '.' and '..' will never be passed here.
210 */ 211 */
211 212
212 de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *) name, len, NULL, &qbh); 213 de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, NULL, &qbh);
213 214
214 /* 215 /*
215 * This is not really a bailout, just means file not found. 216 * This is not really a bailout, just means file not found.
@@ -250,7 +251,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
250 hpfs_result = hpfs_i(result); 251 hpfs_result = hpfs_i(result);
251 if (!de->directory) hpfs_result->i_parent_dir = dir->i_ino; 252 if (!de->directory) hpfs_result->i_parent_dir = dir->i_ino;
252 253
253 hpfs_decide_conv(result, (char *)name, len); 254 hpfs_decide_conv(result, name, len);
254 255
255 if (de->has_acl || de->has_xtd_perm) if (!(dir->i_sb->s_flags & MS_RDONLY)) { 256 if (de->has_acl || de->has_xtd_perm) if (!(dir->i_sb->s_flags & MS_RDONLY)) {
256 hpfs_error(result->i_sb, "ACLs or XPERM found. This is probably HPFS386. This driver doesn't support it now. Send me some info on these structures"); 257 hpfs_error(result->i_sb, "ACLs or XPERM found. This is probably HPFS386. This driver doesn't support it now. Send me some info on these structures");
diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c
index fe83c2b7d2d8..9b2ffadfc8c4 100644
--- a/fs/hpfs/dnode.c
+++ b/fs/hpfs/dnode.c
@@ -158,7 +158,8 @@ static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno
158 158
159/* Add an entry to dnode and don't care if it grows over 2048 bytes */ 159/* Add an entry to dnode and don't care if it grows over 2048 bytes */
160 160
161struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d, unsigned char *name, 161struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d,
162 const unsigned char *name,
162 unsigned namelen, secno down_ptr) 163 unsigned namelen, secno down_ptr)
163{ 164{
164 struct hpfs_dirent *de; 165 struct hpfs_dirent *de;
@@ -223,7 +224,7 @@ static void fix_up_ptrs(struct super_block *s, struct dnode *d)
223/* Add an entry to dnode and do dnode splitting if required */ 224/* Add an entry to dnode and do dnode splitting if required */
224 225
225static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, 226static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
226 unsigned char *name, unsigned namelen, 227 const unsigned char *name, unsigned namelen,
227 struct hpfs_dirent *new_de, dnode_secno down_ptr) 228 struct hpfs_dirent *new_de, dnode_secno down_ptr)
228{ 229{
229 struct quad_buffer_head qbh, qbh1, qbh2; 230 struct quad_buffer_head qbh, qbh1, qbh2;
@@ -231,7 +232,7 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
231 dnode_secno adno, rdno; 232 dnode_secno adno, rdno;
232 struct hpfs_dirent *de; 233 struct hpfs_dirent *de;
233 struct hpfs_dirent nde; 234 struct hpfs_dirent nde;
234 char *nname; 235 unsigned char *nname;
235 int h; 236 int h;
236 int pos; 237 int pos;
237 struct buffer_head *bh; 238 struct buffer_head *bh;
@@ -305,7 +306,9 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
305 pos++; 306 pos++;
306 } 307 }
307 copy_de(new_de = &nde, de); 308 copy_de(new_de = &nde, de);
308 memcpy(name = nname, de->name, namelen = de->namelen); 309 memcpy(nname, de->name, de->namelen);
310 name = nname;
311 namelen = de->namelen;
309 for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | pos, 4); 312 for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | pos, 4);
310 down_ptr = adno; 313 down_ptr = adno;
311 set_last_pointer(i->i_sb, ad, de->down ? de_down_pointer(de) : 0); 314 set_last_pointer(i->i_sb, ad, de->down ? de_down_pointer(de) : 0);
@@ -368,7 +371,8 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
368 * I hope, now it's finally bug-free. 371 * I hope, now it's finally bug-free.
369 */ 372 */
370 373
371int hpfs_add_dirent(struct inode *i, unsigned char *name, unsigned namelen, 374int hpfs_add_dirent(struct inode *i,
375 const unsigned char *name, unsigned namelen,
372 struct hpfs_dirent *new_de, int cdepth) 376 struct hpfs_dirent *new_de, int cdepth)
373{ 377{
374 struct hpfs_inode_info *hpfs_inode = hpfs_i(i); 378 struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
@@ -897,7 +901,8 @@ struct hpfs_dirent *map_pos_dirent(struct inode *inode, loff_t *posp,
897 901
898/* Find a dirent in tree */ 902/* Find a dirent in tree */
899 903
900struct hpfs_dirent *map_dirent(struct inode *inode, dnode_secno dno, char *name, unsigned len, 904struct hpfs_dirent *map_dirent(struct inode *inode, dnode_secno dno,
905 const unsigned char *name, unsigned len,
901 dnode_secno *dd, struct quad_buffer_head *qbh) 906 dnode_secno *dd, struct quad_buffer_head *qbh)
902{ 907{
903 struct dnode *dnode; 908 struct dnode *dnode;
@@ -988,8 +993,8 @@ void hpfs_remove_dtree(struct super_block *s, dnode_secno dno)
988struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno, 993struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno,
989 struct fnode *f, struct quad_buffer_head *qbh) 994 struct fnode *f, struct quad_buffer_head *qbh)
990{ 995{
991 char *name1; 996 unsigned char *name1;
992 char *name2; 997 unsigned char *name2;
993 int name1len, name2len; 998 int name1len, name2len;
994 struct dnode *d; 999 struct dnode *d;
995 dnode_secno dno, downd; 1000 dnode_secno dno, downd;
diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c
index 547a8384571f..45e53d972b42 100644
--- a/fs/hpfs/ea.c
+++ b/fs/hpfs/ea.c
@@ -62,8 +62,8 @@ static char *get_indirect_ea(struct super_block *s, int ano, secno a, int size)
62 return ret; 62 return ret;
63} 63}
64 64
65static void set_indirect_ea(struct super_block *s, int ano, secno a, char *data, 65static void set_indirect_ea(struct super_block *s, int ano, secno a,
66 int size) 66 const char *data, int size)
67{ 67{
68 hpfs_ea_write(s, a, ano, 0, size, data); 68 hpfs_ea_write(s, a, ano, 0, size, data);
69} 69}
@@ -186,7 +186,8 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si
186 * This driver can't change sizes of eas ('cause I just don't need it). 186 * This driver can't change sizes of eas ('cause I just don't need it).
187 */ 187 */
188 188
189void hpfs_set_ea(struct inode *inode, struct fnode *fnode, char *key, char *data, int size) 189void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
190 const char *data, int size)
190{ 191{
191 fnode_secno fno = inode->i_ino; 192 fnode_secno fno = inode->i_ino;
192 struct super_block *s = inode->i_sb; 193 struct super_block *s = inode->i_sb;
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 701ca54c0867..97bf738cd5d6 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -215,7 +215,7 @@ secno hpfs_bplus_lookup(struct super_block *, struct inode *, struct bplus_heade
215secno hpfs_add_sector_to_btree(struct super_block *, secno, int, unsigned); 215secno hpfs_add_sector_to_btree(struct super_block *, secno, int, unsigned);
216void hpfs_remove_btree(struct super_block *, struct bplus_header *); 216void hpfs_remove_btree(struct super_block *, struct bplus_header *);
217int hpfs_ea_read(struct super_block *, secno, int, unsigned, unsigned, char *); 217int hpfs_ea_read(struct super_block *, secno, int, unsigned, unsigned, char *);
218int hpfs_ea_write(struct super_block *, secno, int, unsigned, unsigned, char *); 218int hpfs_ea_write(struct super_block *, secno, int, unsigned, unsigned, const char *);
219void hpfs_ea_remove(struct super_block *, secno, int, unsigned); 219void hpfs_ea_remove(struct super_block *, secno, int, unsigned);
220void hpfs_truncate_btree(struct super_block *, secno, int, unsigned); 220void hpfs_truncate_btree(struct super_block *, secno, int, unsigned);
221void hpfs_remove_fnode(struct super_block *, fnode_secno fno); 221void hpfs_remove_fnode(struct super_block *, fnode_secno fno);
@@ -244,13 +244,17 @@ extern const struct file_operations hpfs_dir_ops;
244 244
245void hpfs_add_pos(struct inode *, loff_t *); 245void hpfs_add_pos(struct inode *, loff_t *);
246void hpfs_del_pos(struct inode *, loff_t *); 246void hpfs_del_pos(struct inode *, loff_t *);
247struct hpfs_dirent *hpfs_add_de(struct super_block *, struct dnode *, unsigned char *, unsigned, secno); 247struct hpfs_dirent *hpfs_add_de(struct super_block *, struct dnode *,
248int hpfs_add_dirent(struct inode *, unsigned char *, unsigned, struct hpfs_dirent *, int); 248 const unsigned char *, unsigned, secno);
249int hpfs_add_dirent(struct inode *, const unsigned char *, unsigned,
250 struct hpfs_dirent *, int);
249int hpfs_remove_dirent(struct inode *, dnode_secno, struct hpfs_dirent *, struct quad_buffer_head *, int); 251int hpfs_remove_dirent(struct inode *, dnode_secno, struct hpfs_dirent *, struct quad_buffer_head *, int);
250void hpfs_count_dnodes(struct super_block *, dnode_secno, int *, int *, int *); 252void hpfs_count_dnodes(struct super_block *, dnode_secno, int *, int *, int *);
251dnode_secno hpfs_de_as_down_as_possible(struct super_block *, dnode_secno dno); 253dnode_secno hpfs_de_as_down_as_possible(struct super_block *, dnode_secno dno);
252struct hpfs_dirent *map_pos_dirent(struct inode *, loff_t *, struct quad_buffer_head *); 254struct hpfs_dirent *map_pos_dirent(struct inode *, loff_t *, struct quad_buffer_head *);
253struct hpfs_dirent *map_dirent(struct inode *, dnode_secno, char *, unsigned, dnode_secno *, struct quad_buffer_head *); 255struct hpfs_dirent *map_dirent(struct inode *, dnode_secno,
256 const unsigned char *, unsigned, dnode_secno *,
257 struct quad_buffer_head *);
254void hpfs_remove_dtree(struct super_block *, dnode_secno); 258void hpfs_remove_dtree(struct super_block *, dnode_secno);
255struct hpfs_dirent *map_fnode_dirent(struct super_block *, fnode_secno, struct fnode *, struct quad_buffer_head *); 259struct hpfs_dirent *map_fnode_dirent(struct super_block *, fnode_secno, struct fnode *, struct quad_buffer_head *);
256 260
@@ -259,7 +263,8 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *, fnode_secno, struct f
259void hpfs_ea_ext_remove(struct super_block *, secno, int, unsigned); 263void hpfs_ea_ext_remove(struct super_block *, secno, int, unsigned);
260int hpfs_read_ea(struct super_block *, struct fnode *, char *, char *, int); 264int hpfs_read_ea(struct super_block *, struct fnode *, char *, char *, int);
261char *hpfs_get_ea(struct super_block *, struct fnode *, char *, int *); 265char *hpfs_get_ea(struct super_block *, struct fnode *, char *, int *);
262void hpfs_set_ea(struct inode *, struct fnode *, char *, char *, int); 266void hpfs_set_ea(struct inode *, struct fnode *, const char *,
267 const char *, int);
263 268
264/* file.c */ 269/* file.c */
265 270
@@ -282,7 +287,7 @@ void hpfs_delete_inode(struct inode *);
282 287
283unsigned *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *); 288unsigned *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *);
284unsigned *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *); 289unsigned *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *);
285char *hpfs_load_code_page(struct super_block *, secno); 290unsigned char *hpfs_load_code_page(struct super_block *, secno);
286secno *hpfs_load_bitmap_directory(struct super_block *, secno bmp); 291secno *hpfs_load_bitmap_directory(struct super_block *, secno bmp);
287struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **); 292struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **);
288struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **); 293struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **);
@@ -292,12 +297,13 @@ dnode_secno hpfs_fnode_dno(struct super_block *s, ino_t ino);
292/* name.c */ 297/* name.c */
293 298
294unsigned char hpfs_upcase(unsigned char *, unsigned char); 299unsigned char hpfs_upcase(unsigned char *, unsigned char);
295int hpfs_chk_name(unsigned char *, unsigned *); 300int hpfs_chk_name(const unsigned char *, unsigned *);
296char *hpfs_translate_name(struct super_block *, unsigned char *, unsigned, int, int); 301unsigned char *hpfs_translate_name(struct super_block *, unsigned char *, unsigned, int, int);
297int hpfs_compare_names(struct super_block *, unsigned char *, unsigned, unsigned char *, unsigned, int); 302int hpfs_compare_names(struct super_block *, const unsigned char *, unsigned,
298int hpfs_is_name_long(unsigned char *, unsigned); 303 const unsigned char *, unsigned, int);
299void hpfs_adjust_length(unsigned char *, unsigned *); 304int hpfs_is_name_long(const unsigned char *, unsigned);
300void hpfs_decide_conv(struct inode *, unsigned char *, unsigned); 305void hpfs_adjust_length(const unsigned char *, unsigned *);
306void hpfs_decide_conv(struct inode *, const unsigned char *, unsigned);
301 307
302/* namei.c */ 308/* namei.c */
303 309
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index fe703ae46bc7..1042a9bc97f3 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/smp_lock.h> 9#include <linux/smp_lock.h>
10#include <linux/slab.h>
10#include "hpfs_fn.h" 11#include "hpfs_fn.h"
11 12
12void hpfs_init_inode(struct inode *i) 13void hpfs_init_inode(struct inode *i)
@@ -46,7 +47,7 @@ void hpfs_read_inode(struct inode *i)
46 struct fnode *fnode; 47 struct fnode *fnode;
47 struct super_block *sb = i->i_sb; 48 struct super_block *sb = i->i_sb;
48 struct hpfs_inode_info *hpfs_inode = hpfs_i(i); 49 struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
49 unsigned char *ea; 50 void *ea;
50 int ea_size; 51 int ea_size;
51 52
52 if (!(fnode = hpfs_map_fnode(sb, i->i_ino, &bh))) { 53 if (!(fnode = hpfs_map_fnode(sb, i->i_ino, &bh))) {
@@ -112,7 +113,7 @@ void hpfs_read_inode(struct inode *i)
112 } 113 }
113 } 114 }
114 if (fnode->dirflag) { 115 if (fnode->dirflag) {
115 unsigned n_dnodes, n_subdirs; 116 int n_dnodes, n_subdirs;
116 i->i_mode |= S_IFDIR; 117 i->i_mode |= S_IFDIR;
117 i->i_op = &hpfs_dir_iops; 118 i->i_op = &hpfs_dir_iops;
118 i->i_fop = &hpfs_dir_ops; 119 i->i_fop = &hpfs_dir_ops;
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index c4724589b2eb..840d033ecee8 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -35,7 +35,7 @@ unsigned int *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
35 * lowercasing table 35 * lowercasing table
36 */ 36 */
37 37
38char *hpfs_load_code_page(struct super_block *s, secno cps) 38unsigned char *hpfs_load_code_page(struct super_block *s, secno cps)
39{ 39{
40 struct buffer_head *bh; 40 struct buffer_head *bh;
41 secno cpds; 41 secno cpds;
@@ -71,7 +71,7 @@ char *hpfs_load_code_page(struct super_block *s, secno cps)
71 brelse(bh); 71 brelse(bh);
72 return NULL; 72 return NULL;
73 } 73 }
74 ptr = (char *)cpd + cpd->offs[cpi] + 6; 74 ptr = (unsigned char *)cpd + cpd->offs[cpi] + 6;
75 if (!(cp_table = kmalloc(256, GFP_KERNEL))) { 75 if (!(cp_table = kmalloc(256, GFP_KERNEL))) {
76 printk("HPFS: out of memory for code page table\n"); 76 printk("HPFS: out of memory for code page table\n");
77 brelse(bh); 77 brelse(bh);
@@ -217,7 +217,7 @@ struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno,
217 if ((dnode = hpfs_map_4sectors(s, secno, qbh, DNODE_RD_AHEAD))) 217 if ((dnode = hpfs_map_4sectors(s, secno, qbh, DNODE_RD_AHEAD)))
218 if (hpfs_sb(s)->sb_chk) { 218 if (hpfs_sb(s)->sb_chk) {
219 unsigned p, pp = 0; 219 unsigned p, pp = 0;
220 unsigned char *d = (char *)dnode; 220 unsigned char *d = (unsigned char *)dnode;
221 int b = 0; 221 int b = 0;
222 if (dnode->magic != DNODE_MAGIC) { 222 if (dnode->magic != DNODE_MAGIC) {
223 hpfs_error(s, "bad magic on dnode %08x", secno); 223 hpfs_error(s, "bad magic on dnode %08x", secno);
diff --git a/fs/hpfs/name.c b/fs/hpfs/name.c
index 1f4a964384eb..f24736d7a439 100644
--- a/fs/hpfs/name.c
+++ b/fs/hpfs/name.c
@@ -8,16 +8,16 @@
8 8
9#include "hpfs_fn.h" 9#include "hpfs_fn.h"
10 10
11static char *text_postfix[]={ 11static const char *text_postfix[]={
12".ASM", ".BAS", ".BAT", ".C", ".CC", ".CFG", ".CMD", ".CON", ".CPP", ".DEF", 12".ASM", ".BAS", ".BAT", ".C", ".CC", ".CFG", ".CMD", ".CON", ".CPP", ".DEF",
13".DOC", ".DPR", ".ERX", ".H", ".HPP", ".HTM", ".HTML", ".JAVA", ".LOG", ".PAS", 13".DOC", ".DPR", ".ERX", ".H", ".HPP", ".HTM", ".HTML", ".JAVA", ".LOG", ".PAS",
14".RC", ".TEX", ".TXT", ".Y", ""}; 14".RC", ".TEX", ".TXT", ".Y", ""};
15 15
16static char *text_prefix[]={ 16static const char *text_prefix[]={
17"AUTOEXEC.", "CHANGES", "COPYING", "CONFIG.", "CREDITS", "FAQ", "FILE_ID.DIZ", 17"AUTOEXEC.", "CHANGES", "COPYING", "CONFIG.", "CREDITS", "FAQ", "FILE_ID.DIZ",
18"MAKEFILE", "READ.ME", "README", "TERMCAP", ""}; 18"MAKEFILE", "READ.ME", "README", "TERMCAP", ""};
19 19
20void hpfs_decide_conv(struct inode *inode, unsigned char *name, unsigned len) 20void hpfs_decide_conv(struct inode *inode, const unsigned char *name, unsigned len)
21{ 21{
22 struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); 22 struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
23 int i; 23 int i;
@@ -71,7 +71,7 @@ static inline unsigned char locase(unsigned char *dir, unsigned char a)
71 return dir[a]; 71 return dir[a];
72} 72}
73 73
74int hpfs_chk_name(unsigned char *name, unsigned *len) 74int hpfs_chk_name(const unsigned char *name, unsigned *len)
75{ 75{
76 int i; 76 int i;
77 if (*len > 254) return -ENAMETOOLONG; 77 if (*len > 254) return -ENAMETOOLONG;
@@ -83,10 +83,10 @@ int hpfs_chk_name(unsigned char *name, unsigned *len)
83 return 0; 83 return 0;
84} 84}
85 85
86char *hpfs_translate_name(struct super_block *s, unsigned char *from, 86unsigned char *hpfs_translate_name(struct super_block *s, unsigned char *from,
87 unsigned len, int lc, int lng) 87 unsigned len, int lc, int lng)
88{ 88{
89 char *to; 89 unsigned char *to;
90 int i; 90 int i;
91 if (hpfs_sb(s)->sb_chk >= 2) if (hpfs_is_name_long(from, len) != lng) { 91 if (hpfs_sb(s)->sb_chk >= 2) if (hpfs_is_name_long(from, len) != lng) {
92 printk("HPFS: Long name flag mismatch - name "); 92 printk("HPFS: Long name flag mismatch - name ");
@@ -103,8 +103,9 @@ char *hpfs_translate_name(struct super_block *s, unsigned char *from,
103 return to; 103 return to;
104} 104}
105 105
106int hpfs_compare_names(struct super_block *s, unsigned char *n1, unsigned l1, 106int hpfs_compare_names(struct super_block *s,
107 unsigned char *n2, unsigned l2, int last) 107 const unsigned char *n1, unsigned l1,
108 const unsigned char *n2, unsigned l2, int last)
108{ 109{
109 unsigned l = l1 < l2 ? l1 : l2; 110 unsigned l = l1 < l2 ? l1 : l2;
110 unsigned i; 111 unsigned i;
@@ -120,7 +121,7 @@ int hpfs_compare_names(struct super_block *s, unsigned char *n1, unsigned l1,
120 return 0; 121 return 0;
121} 122}
122 123
123int hpfs_is_name_long(unsigned char *name, unsigned len) 124int hpfs_is_name_long(const unsigned char *name, unsigned len)
124{ 125{
125 int i,j; 126 int i,j;
126 for (i = 0; i < len && name[i] != '.'; i++) 127 for (i = 0; i < len && name[i] != '.'; i++)
@@ -134,7 +135,7 @@ int hpfs_is_name_long(unsigned char *name, unsigned len)
134 135
135/* OS/2 clears dots and spaces at the end of file name, so we have to */ 136/* OS/2 clears dots and spaces at the end of file name, so we have to */
136 137
137void hpfs_adjust_length(unsigned char *name, unsigned *len) 138void hpfs_adjust_length(const unsigned char *name, unsigned *len)
138{ 139{
139 if (!*len) return; 140 if (!*len) return;
140 if (*len == 1 && name[0] == '.') return; 141 if (*len == 1 && name[0] == '.') return;
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 82b9c4ba9ed0..11c2b4080f65 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -11,7 +11,7 @@
11 11
12static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 12static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
13{ 13{
14 const char *name = dentry->d_name.name; 14 const unsigned char *name = dentry->d_name.name;
15 unsigned len = dentry->d_name.len; 15 unsigned len = dentry->d_name.len;
16 struct quad_buffer_head qbh0; 16 struct quad_buffer_head qbh0;
17 struct buffer_head *bh; 17 struct buffer_head *bh;
@@ -24,7 +24,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
24 int r; 24 int r;
25 struct hpfs_dirent dee; 25 struct hpfs_dirent dee;
26 int err; 26 int err;
27 if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err; 27 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
28 lock_kernel(); 28 lock_kernel();
29 err = -ENOSPC; 29 err = -ENOSPC;
30 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); 30 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
@@ -62,7 +62,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
62 result->i_mode &= ~0222; 62 result->i_mode &= ~0222;
63 63
64 mutex_lock(&hpfs_i(dir)->i_mutex); 64 mutex_lock(&hpfs_i(dir)->i_mutex);
65 r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0); 65 r = hpfs_add_dirent(dir, name, len, &dee, 0);
66 if (r == 1) 66 if (r == 1)
67 goto bail3; 67 goto bail3;
68 if (r == -1) { 68 if (r == -1) {
@@ -121,7 +121,7 @@ bail:
121 121
122static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) 122static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
123{ 123{
124 const char *name = dentry->d_name.name; 124 const unsigned char *name = dentry->d_name.name;
125 unsigned len = dentry->d_name.len; 125 unsigned len = dentry->d_name.len;
126 struct inode *result = NULL; 126 struct inode *result = NULL;
127 struct buffer_head *bh; 127 struct buffer_head *bh;
@@ -130,7 +130,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
130 int r; 130 int r;
131 struct hpfs_dirent dee; 131 struct hpfs_dirent dee;
132 int err; 132 int err;
133 if ((err = hpfs_chk_name((char *)name, &len))) 133 if ((err = hpfs_chk_name(name, &len)))
134 return err==-ENOENT ? -EINVAL : err; 134 return err==-ENOENT ? -EINVAL : err;
135 lock_kernel(); 135 lock_kernel();
136 err = -ENOSPC; 136 err = -ENOSPC;
@@ -155,7 +155,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
155 result->i_op = &hpfs_file_iops; 155 result->i_op = &hpfs_file_iops;
156 result->i_fop = &hpfs_file_ops; 156 result->i_fop = &hpfs_file_ops;
157 result->i_nlink = 1; 157 result->i_nlink = 1;
158 hpfs_decide_conv(result, (char *)name, len); 158 hpfs_decide_conv(result, name, len);
159 hpfs_i(result)->i_parent_dir = dir->i_ino; 159 hpfs_i(result)->i_parent_dir = dir->i_ino;
160 result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, dee.creation_date); 160 result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, dee.creation_date);
161 result->i_ctime.tv_nsec = 0; 161 result->i_ctime.tv_nsec = 0;
@@ -170,7 +170,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
170 hpfs_i(result)->mmu_private = 0; 170 hpfs_i(result)->mmu_private = 0;
171 171
172 mutex_lock(&hpfs_i(dir)->i_mutex); 172 mutex_lock(&hpfs_i(dir)->i_mutex);
173 r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0); 173 r = hpfs_add_dirent(dir, name, len, &dee, 0);
174 if (r == 1) 174 if (r == 1)
175 goto bail2; 175 goto bail2;
176 if (r == -1) { 176 if (r == -1) {
@@ -211,7 +211,7 @@ bail:
211 211
212static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) 212static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
213{ 213{
214 const char *name = dentry->d_name.name; 214 const unsigned char *name = dentry->d_name.name;
215 unsigned len = dentry->d_name.len; 215 unsigned len = dentry->d_name.len;
216 struct buffer_head *bh; 216 struct buffer_head *bh;
217 struct fnode *fnode; 217 struct fnode *fnode;
@@ -220,7 +220,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
220 struct hpfs_dirent dee; 220 struct hpfs_dirent dee;
221 struct inode *result = NULL; 221 struct inode *result = NULL;
222 int err; 222 int err;
223 if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err; 223 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
224 if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM; 224 if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM;
225 if (!new_valid_dev(rdev)) 225 if (!new_valid_dev(rdev))
226 return -EINVAL; 226 return -EINVAL;
@@ -256,7 +256,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
256 init_special_inode(result, mode, rdev); 256 init_special_inode(result, mode, rdev);
257 257
258 mutex_lock(&hpfs_i(dir)->i_mutex); 258 mutex_lock(&hpfs_i(dir)->i_mutex);
259 r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0); 259 r = hpfs_add_dirent(dir, name, len, &dee, 0);
260 if (r == 1) 260 if (r == 1)
261 goto bail2; 261 goto bail2;
262 if (r == -1) { 262 if (r == -1) {
@@ -289,7 +289,7 @@ bail:
289 289
290static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *symlink) 290static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *symlink)
291{ 291{
292 const char *name = dentry->d_name.name; 292 const unsigned char *name = dentry->d_name.name;
293 unsigned len = dentry->d_name.len; 293 unsigned len = dentry->d_name.len;
294 struct buffer_head *bh; 294 struct buffer_head *bh;
295 struct fnode *fnode; 295 struct fnode *fnode;
@@ -298,7 +298,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
298 struct hpfs_dirent dee; 298 struct hpfs_dirent dee;
299 struct inode *result; 299 struct inode *result;
300 int err; 300 int err;
301 if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err; 301 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
302 lock_kernel(); 302 lock_kernel();
303 if (hpfs_sb(dir->i_sb)->sb_eas < 2) { 303 if (hpfs_sb(dir->i_sb)->sb_eas < 2) {
304 unlock_kernel(); 304 unlock_kernel();
@@ -335,7 +335,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
335 result->i_data.a_ops = &hpfs_symlink_aops; 335 result->i_data.a_ops = &hpfs_symlink_aops;
336 336
337 mutex_lock(&hpfs_i(dir)->i_mutex); 337 mutex_lock(&hpfs_i(dir)->i_mutex);
338 r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0); 338 r = hpfs_add_dirent(dir, name, len, &dee, 0);
339 if (r == 1) 339 if (r == 1)
340 goto bail2; 340 goto bail2;
341 if (r == -1) { 341 if (r == -1) {
@@ -345,7 +345,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
345 fnode->len = len; 345 fnode->len = len;
346 memcpy(fnode->name, name, len > 15 ? 15 : len); 346 memcpy(fnode->name, name, len > 15 ? 15 : len);
347 fnode->up = dir->i_ino; 347 fnode->up = dir->i_ino;
348 hpfs_set_ea(result, fnode, "SYMLINK", (char *)symlink, strlen(symlink)); 348 hpfs_set_ea(result, fnode, "SYMLINK", symlink, strlen(symlink));
349 mark_buffer_dirty(bh); 349 mark_buffer_dirty(bh);
350 brelse(bh); 350 brelse(bh);
351 351
@@ -369,7 +369,7 @@ bail:
369 369
370static int hpfs_unlink(struct inode *dir, struct dentry *dentry) 370static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
371{ 371{
372 const char *name = dentry->d_name.name; 372 const unsigned char *name = dentry->d_name.name;
373 unsigned len = dentry->d_name.len; 373 unsigned len = dentry->d_name.len;
374 struct quad_buffer_head qbh; 374 struct quad_buffer_head qbh;
375 struct hpfs_dirent *de; 375 struct hpfs_dirent *de;
@@ -381,12 +381,12 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
381 int err; 381 int err;
382 382
383 lock_kernel(); 383 lock_kernel();
384 hpfs_adjust_length((char *)name, &len); 384 hpfs_adjust_length(name, &len);
385again: 385again:
386 mutex_lock(&hpfs_i(inode)->i_parent_mutex); 386 mutex_lock(&hpfs_i(inode)->i_parent_mutex);
387 mutex_lock(&hpfs_i(dir)->i_mutex); 387 mutex_lock(&hpfs_i(dir)->i_mutex);
388 err = -ENOENT; 388 err = -ENOENT;
389 de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *)name, len, &dno, &qbh); 389 de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh);
390 if (!de) 390 if (!de)
391 goto out; 391 goto out;
392 392
@@ -413,22 +413,25 @@ again:
413 413
414 mutex_unlock(&hpfs_i(dir)->i_mutex); 414 mutex_unlock(&hpfs_i(dir)->i_mutex);
415 mutex_unlock(&hpfs_i(inode)->i_parent_mutex); 415 mutex_unlock(&hpfs_i(inode)->i_parent_mutex);
416 d_drop(dentry); 416 dentry_unhash(dentry);
417 spin_lock(&dentry->d_lock); 417 if (!d_unhashed(dentry)) {
418 if (atomic_read(&dentry->d_count) > 1 || 418 dput(dentry);
419 generic_permission(inode, MAY_WRITE, NULL) || 419 unlock_kernel();
420 return -ENOSPC;
421 }
422 if (generic_permission(inode, MAY_WRITE, NULL) ||
420 !S_ISREG(inode->i_mode) || 423 !S_ISREG(inode->i_mode) ||
421 get_write_access(inode)) { 424 get_write_access(inode)) {
422 spin_unlock(&dentry->d_lock);
423 d_rehash(dentry); 425 d_rehash(dentry);
426 dput(dentry);
424 } else { 427 } else {
425 struct iattr newattrs; 428 struct iattr newattrs;
426 spin_unlock(&dentry->d_lock);
427 /*printk("HPFS: truncating file before delete.\n");*/ 429 /*printk("HPFS: truncating file before delete.\n");*/
428 newattrs.ia_size = 0; 430 newattrs.ia_size = 0;
429 newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; 431 newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
430 err = notify_change(dentry, &newattrs); 432 err = notify_change(dentry, &newattrs);
431 put_write_access(inode); 433 put_write_access(inode);
434 dput(dentry);
432 if (!err) 435 if (!err)
433 goto again; 436 goto again;
434 } 437 }
@@ -451,7 +454,7 @@ out:
451 454
452static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) 455static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
453{ 456{
454 const char *name = dentry->d_name.name; 457 const unsigned char *name = dentry->d_name.name;
455 unsigned len = dentry->d_name.len; 458 unsigned len = dentry->d_name.len;
456 struct quad_buffer_head qbh; 459 struct quad_buffer_head qbh;
457 struct hpfs_dirent *de; 460 struct hpfs_dirent *de;
@@ -462,12 +465,12 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
462 int err; 465 int err;
463 int r; 466 int r;
464 467
465 hpfs_adjust_length((char *)name, &len); 468 hpfs_adjust_length(name, &len);
466 lock_kernel(); 469 lock_kernel();
467 mutex_lock(&hpfs_i(inode)->i_parent_mutex); 470 mutex_lock(&hpfs_i(inode)->i_parent_mutex);
468 mutex_lock(&hpfs_i(dir)->i_mutex); 471 mutex_lock(&hpfs_i(dir)->i_mutex);
469 err = -ENOENT; 472 err = -ENOENT;
470 de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *)name, len, &dno, &qbh); 473 de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh);
471 if (!de) 474 if (!de)
472 goto out; 475 goto out;
473 476
@@ -546,10 +549,10 @@ const struct address_space_operations hpfs_symlink_aops = {
546static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, 549static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
547 struct inode *new_dir, struct dentry *new_dentry) 550 struct inode *new_dir, struct dentry *new_dentry)
548{ 551{
549 char *old_name = (char *)old_dentry->d_name.name; 552 const unsigned char *old_name = old_dentry->d_name.name;
550 int old_len = old_dentry->d_name.len; 553 unsigned old_len = old_dentry->d_name.len;
551 char *new_name = (char *)new_dentry->d_name.name; 554 const unsigned char *new_name = new_dentry->d_name.name;
552 int new_len = new_dentry->d_name.len; 555 unsigned new_len = new_dentry->d_name.len;
553 struct inode *i = old_dentry->d_inode; 556 struct inode *i = old_dentry->d_inode;
554 struct inode *new_inode = new_dentry->d_inode; 557 struct inode *new_inode = new_dentry->d_inode;
555 struct quad_buffer_head qbh, qbh1; 558 struct quad_buffer_head qbh, qbh1;
@@ -560,9 +563,9 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
560 struct buffer_head *bh; 563 struct buffer_head *bh;
561 struct fnode *fnode; 564 struct fnode *fnode;
562 int err; 565 int err;
563 if ((err = hpfs_chk_name((char *)new_name, &new_len))) return err; 566 if ((err = hpfs_chk_name(new_name, &new_len))) return err;
564 err = 0; 567 err = 0;
565 hpfs_adjust_length((char *)old_name, &old_len); 568 hpfs_adjust_length(old_name, &old_len);
566 569
567 lock_kernel(); 570 lock_kernel();
568 /* order doesn't matter, due to VFS exclusion */ 571 /* order doesn't matter, due to VFS exclusion */
@@ -579,7 +582,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
579 goto end1; 582 goto end1;
580 } 583 }
581 584
582 if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, (char *)old_name, old_len, &dno, &qbh))) { 585 if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) {
583 hpfs_error(i->i_sb, "lookup succeeded but map dirent failed"); 586 hpfs_error(i->i_sb, "lookup succeeded but map dirent failed");
584 err = -ENOENT; 587 err = -ENOENT;
585 goto end1; 588 goto end1;
@@ -590,7 +593,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
590 if (new_inode) { 593 if (new_inode) {
591 int r; 594 int r;
592 if ((r = hpfs_remove_dirent(old_dir, dno, dep, &qbh, 1)) != 2) { 595 if ((r = hpfs_remove_dirent(old_dir, dno, dep, &qbh, 1)) != 2) {
593 if ((nde = map_dirent(new_dir, hpfs_i(new_dir)->i_dno, (char *)new_name, new_len, NULL, &qbh1))) { 596 if ((nde = map_dirent(new_dir, hpfs_i(new_dir)->i_dno, new_name, new_len, NULL, &qbh1))) {
594 clear_nlink(new_inode); 597 clear_nlink(new_inode);
595 copy_de(nde, &de); 598 copy_de(nde, &de);
596 memcpy(nde->name, new_name, new_len); 599 memcpy(nde->name, new_name, new_len);
@@ -618,7 +621,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
618 } 621 }
619 622
620 if (new_dir == old_dir) 623 if (new_dir == old_dir)
621 if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, (char *)old_name, old_len, &dno, &qbh))) { 624 if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) {
622 hpfs_unlock_creation(i->i_sb); 625 hpfs_unlock_creation(i->i_sb);
623 hpfs_error(i->i_sb, "lookup succeeded but map dirent failed at #2"); 626 hpfs_error(i->i_sb, "lookup succeeded but map dirent failed at #2");
624 err = -ENOENT; 627 err = -ENOENT;
@@ -648,7 +651,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
648 brelse(bh); 651 brelse(bh);
649 } 652 }
650 hpfs_i(i)->i_conv = hpfs_sb(i->i_sb)->sb_conv; 653 hpfs_i(i)->i_conv = hpfs_sb(i->i_sb)->sb_conv;
651 hpfs_decide_conv(i, (char *)new_name, new_len); 654 hpfs_decide_conv(i, new_name, new_len);
652end1: 655end1:
653 if (old_dir != new_dir) 656 if (old_dir != new_dir)
654 mutex_unlock(&hpfs_i(new_dir)->i_mutex); 657 mutex_unlock(&hpfs_i(new_dir)->i_mutex);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index cadc4ce48656..aa53842c599c 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -15,6 +15,7 @@
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/smp_lock.h> 16#include <linux/smp_lock.h>
17#include <linux/bitmap.h> 17#include <linux/bitmap.h>
18#include <linux/slab.h>
18 19
19/* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */ 20/* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
20 21
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index a5089a6dd67a..2e4dfa8593da 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -646,22 +646,27 @@ static const struct super_operations hppfs_sbops = {
646static int hppfs_readlink(struct dentry *dentry, char __user *buffer, 646static int hppfs_readlink(struct dentry *dentry, char __user *buffer,
647 int buflen) 647 int buflen)
648{ 648{
649 struct dentry *proc_dentry; 649 struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
650
651 proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
652 return proc_dentry->d_inode->i_op->readlink(proc_dentry, buffer, 650 return proc_dentry->d_inode->i_op->readlink(proc_dentry, buffer,
653 buflen); 651 buflen);
654} 652}
655 653
656static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) 654static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd)
657{ 655{
658 struct dentry *proc_dentry; 656 struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
659
660 proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
661 657
662 return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd); 658 return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd);
663} 659}
664 660
661static void hppfs_put_link(struct dentry *dentry, struct nameidata *nd,
662 void *cookie)
663{
664 struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
665
666 if (proc_dentry->d_inode->i_op->put_link)
667 proc_dentry->d_inode->i_op->put_link(proc_dentry, nd, cookie);
668}
669
665static const struct inode_operations hppfs_dir_iops = { 670static const struct inode_operations hppfs_dir_iops = {
666 .lookup = hppfs_lookup, 671 .lookup = hppfs_lookup,
667}; 672};
@@ -669,6 +674,7 @@ static const struct inode_operations hppfs_dir_iops = {
669static const struct inode_operations hppfs_link_iops = { 674static const struct inode_operations hppfs_link_iops = {
670 .readlink = hppfs_readlink, 675 .readlink = hppfs_readlink,
671 .follow_link = hppfs_follow_link, 676 .follow_link = hppfs_follow_link,
677 .put_link = hppfs_put_link,
672}; 678};
673 679
674static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) 680static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
@@ -712,7 +718,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
712 struct vfsmount *proc_mnt; 718 struct vfsmount *proc_mnt;
713 int err = -ENOENT; 719 int err = -ENOENT;
714 720
715 proc_mnt = do_kern_mount("proc", 0, "proc", NULL); 721 proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt);
716 if (IS_ERR(proc_mnt)) 722 if (IS_ERR(proc_mnt))
717 goto out; 723 goto out;
718 724
diff --git a/fs/inode.c b/fs/inode.c
index 03dfeb2e3928..407bf392e20a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -8,7 +8,6 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/dcache.h> 9#include <linux/dcache.h>
10#include <linux/init.h> 10#include <linux/init.h>
11#include <linux/quotaops.h>
12#include <linux/slab.h> 11#include <linux/slab.h>
13#include <linux/writeback.h> 12#include <linux/writeback.h>
14#include <linux/module.h> 13#include <linux/module.h>
@@ -314,7 +313,6 @@ void clear_inode(struct inode *inode)
314 BUG_ON(!(inode->i_state & I_FREEING)); 313 BUG_ON(!(inode->i_state & I_FREEING));
315 BUG_ON(inode->i_state & I_CLEAR); 314 BUG_ON(inode->i_state & I_CLEAR);
316 inode_sync_wait(inode); 315 inode_sync_wait(inode);
317 vfs_dq_drop(inode);
318 if (inode->i_sb->s_op->clear_inode) 316 if (inode->i_sb->s_op->clear_inode)
319 inode->i_sb->s_op->clear_inode(inode); 317 inode->i_sb->s_op->clear_inode(inode);
320 if (S_ISBLK(inode->i_mode) && inode->i_bdev) 318 if (S_ISBLK(inode->i_mode) && inode->i_bdev)
@@ -1211,8 +1209,6 @@ void generic_delete_inode(struct inode *inode)
1211 1209
1212 if (op->delete_inode) { 1210 if (op->delete_inode) {
1213 void (*delete)(struct inode *) = op->delete_inode; 1211 void (*delete)(struct inode *) = op->delete_inode;
1214 if (!is_bad_inode(inode))
1215 vfs_dq_init(inode);
1216 /* Filesystems implementing their own 1212 /* Filesystems implementing their own
1217 * s_op->delete_inode are required to call 1213 * s_op->delete_inode are required to call
1218 * truncate_inode_pages and clear_inode() 1214 * truncate_inode_pages and clear_inode()
diff --git a/fs/internal.h b/fs/internal.h
index e96a1667d749..8a03a5447bdf 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -70,6 +70,8 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
70 70
71extern void __init mnt_init(void); 71extern void __init mnt_init(void);
72 72
73extern spinlock_t vfsmount_lock;
74
73/* 75/*
74 * fs_struct.c 76 * fs_struct.c
75 */ 77 */
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 6c751106c2e5..7faefb4da939 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -228,14 +228,23 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
228 228
229#ifdef CONFIG_BLOCK 229#ifdef CONFIG_BLOCK
230 230
231#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits) 231static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
232#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits); 232{
233 return (offset >> inode->i_blkbits);
234}
235
236static inline loff_t blk_to_logical(struct inode *inode, sector_t blk)
237{
238 return (blk << inode->i_blkbits);
239}
233 240
234/** 241/**
235 * __generic_block_fiemap - FIEMAP for block based inodes (no locking) 242 * __generic_block_fiemap - FIEMAP for block based inodes (no locking)
236 * @inode - the inode to map 243 * @inode: the inode to map
237 * @arg - the pointer to userspace where we copy everything to 244 * @fieinfo: the fiemap info struct that will be passed back to userspace
238 * @get_block - the fs's get_block function 245 * @start: where to start mapping in the inode
246 * @len: how much space to map
247 * @get_block: the fs's get_block function
239 * 248 *
240 * This does FIEMAP for block based inodes. Basically it will just loop 249 * This does FIEMAP for block based inodes. Basically it will just loop
241 * through get_block until we hit the number of extents we want to map, or we 250 * through get_block until we hit the number of extents we want to map, or we
@@ -250,58 +259,63 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
250 */ 259 */
251 260
252int __generic_block_fiemap(struct inode *inode, 261int __generic_block_fiemap(struct inode *inode,
253 struct fiemap_extent_info *fieinfo, u64 start, 262 struct fiemap_extent_info *fieinfo, loff_t start,
254 u64 len, get_block_t *get_block) 263 loff_t len, get_block_t *get_block)
255{ 264{
256 struct buffer_head tmp; 265 struct buffer_head map_bh;
257 unsigned long long start_blk; 266 sector_t start_blk, last_blk;
258 long long length = 0, map_len = 0; 267 loff_t isize = i_size_read(inode);
259 u64 logical = 0, phys = 0, size = 0; 268 u64 logical = 0, phys = 0, size = 0;
260 u32 flags = FIEMAP_EXTENT_MERGED; 269 u32 flags = FIEMAP_EXTENT_MERGED;
261 int ret = 0, past_eof = 0, whole_file = 0; 270 bool past_eof = false, whole_file = false;
271 int ret = 0;
262 272
263 if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC))) 273 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
274 if (ret)
264 return ret; 275 return ret;
265 276
266 start_blk = logical_to_blk(inode, start); 277 /*
267 278 * Either the i_mutex or other appropriate locking needs to be held
268 length = (long long)min_t(u64, len, i_size_read(inode)); 279 * since we expect isize to not change at all through the duration of
269 if (length < len) 280 * this call.
270 whole_file = 1; 281 */
282 if (len >= isize) {
283 whole_file = true;
284 len = isize;
285 }
271 286
272 map_len = length; 287 start_blk = logical_to_blk(inode, start);
288 last_blk = logical_to_blk(inode, start + len - 1);
273 289
274 do { 290 do {
275 /* 291 /*
276 * we set b_size to the total size we want so it will map as 292 * we set b_size to the total size we want so it will map as
277 * many contiguous blocks as possible at once 293 * many contiguous blocks as possible at once
278 */ 294 */
279 memset(&tmp, 0, sizeof(struct buffer_head)); 295 memset(&map_bh, 0, sizeof(struct buffer_head));
280 tmp.b_size = map_len; 296 map_bh.b_size = len;
281 297
282 ret = get_block(inode, start_blk, &tmp, 0); 298 ret = get_block(inode, start_blk, &map_bh, 0);
283 if (ret) 299 if (ret)
284 break; 300 break;
285 301
286 /* HOLE */ 302 /* HOLE */
287 if (!buffer_mapped(&tmp)) { 303 if (!buffer_mapped(&map_bh)) {
288 length -= blk_to_logical(inode, 1);
289 start_blk++; 304 start_blk++;
290 305
291 /* 306 /*
292 * we want to handle the case where there is an 307 * We want to handle the case where there is an
293 * allocated block at the front of the file, and then 308 * allocated block at the front of the file, and then
294 * nothing but holes up to the end of the file properly, 309 * nothing but holes up to the end of the file properly,
295 * to make sure that extent at the front gets properly 310 * to make sure that extent at the front gets properly
296 * marked with FIEMAP_EXTENT_LAST 311 * marked with FIEMAP_EXTENT_LAST
297 */ 312 */
298 if (!past_eof && 313 if (!past_eof &&
299 blk_to_logical(inode, start_blk) >= 314 blk_to_logical(inode, start_blk) >= isize)
300 blk_to_logical(inode, 0)+i_size_read(inode))
301 past_eof = 1; 315 past_eof = 1;
302 316
303 /* 317 /*
304 * first hole after going past the EOF, this is our 318 * First hole after going past the EOF, this is our
305 * last extent 319 * last extent
306 */ 320 */
307 if (past_eof && size) { 321 if (past_eof && size) {
@@ -309,15 +323,18 @@ int __generic_block_fiemap(struct inode *inode,
309 ret = fiemap_fill_next_extent(fieinfo, logical, 323 ret = fiemap_fill_next_extent(fieinfo, logical,
310 phys, size, 324 phys, size,
311 flags); 325 flags);
312 break; 326 } else if (size) {
327 ret = fiemap_fill_next_extent(fieinfo, logical,
328 phys, size, flags);
329 size = 0;
313 } 330 }
314 331
315 /* if we have holes up to/past EOF then we're done */ 332 /* if we have holes up to/past EOF then we're done */
316 if (length <= 0 || past_eof) 333 if (start_blk > last_blk || past_eof || ret)
317 break; 334 break;
318 } else { 335 } else {
319 /* 336 /*
320 * we have gone over the length of what we wanted to 337 * We have gone over the length of what we wanted to
321 * map, and it wasn't the entire file, so add the extent 338 * map, and it wasn't the entire file, so add the extent
322 * we got last time and exit. 339 * we got last time and exit.
323 * 340 *
@@ -331,7 +348,7 @@ int __generic_block_fiemap(struct inode *inode,
331 * are good to go, just add the extent to the fieinfo 348 * are good to go, just add the extent to the fieinfo
332 * and break 349 * and break
333 */ 350 */
334 if (length <= 0 && !whole_file) { 351 if (start_blk > last_blk && !whole_file) {
335 ret = fiemap_fill_next_extent(fieinfo, logical, 352 ret = fiemap_fill_next_extent(fieinfo, logical,
336 phys, size, 353 phys, size,
337 flags); 354 flags);
@@ -351,11 +368,10 @@ int __generic_block_fiemap(struct inode *inode,
351 } 368 }
352 369
353 logical = blk_to_logical(inode, start_blk); 370 logical = blk_to_logical(inode, start_blk);
354 phys = blk_to_logical(inode, tmp.b_blocknr); 371 phys = blk_to_logical(inode, map_bh.b_blocknr);
355 size = tmp.b_size; 372 size = map_bh.b_size;
356 flags = FIEMAP_EXTENT_MERGED; 373 flags = FIEMAP_EXTENT_MERGED;
357 374
358 length -= tmp.b_size;
359 start_blk += logical_to_blk(inode, size); 375 start_blk += logical_to_blk(inode, size);
360 376
361 /* 377 /*
@@ -363,15 +379,13 @@ int __generic_block_fiemap(struct inode *inode,
363 * soon as we find a hole that the last extent we found 379 * soon as we find a hole that the last extent we found
364 * is marked with FIEMAP_EXTENT_LAST 380 * is marked with FIEMAP_EXTENT_LAST
365 */ 381 */
366 if (!past_eof && 382 if (!past_eof && logical + size >= isize)
367 logical+size >= 383 past_eof = true;
368 blk_to_logical(inode, 0)+i_size_read(inode))
369 past_eof = 1;
370 } 384 }
371 cond_resched(); 385 cond_resched();
372 } while (1); 386 } while (1);
373 387
374 /* if ret is 1 then we just hit the end of the extent array */ 388 /* If ret is 1 then we just hit the end of the extent array */
375 if (ret == 1) 389 if (ret == 1)
376 ret = 0; 390 ret = 0;
377 391
diff --git a/fs/ioprio.c b/fs/ioprio.c
index c7c0b28d7d21..748cfb92dcc6 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -19,6 +19,7 @@
19 * See also Documentation/block/ioprio.txt 19 * See also Documentation/block/ioprio.txt
20 * 20 *
21 */ 21 */
22#include <linux/gfp.h>
22#include <linux/kernel.h> 23#include <linux/kernel.h>
23#include <linux/ioprio.h> 24#include <linux/ioprio.h>
24#include <linux/blkdev.h> 25#include <linux/blkdev.h>
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 8ba5441063be..b9ab69b3a482 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -11,6 +11,7 @@
11 * isofs directory handling functions 11 * isofs directory handling functions
12 */ 12 */
13#include <linux/smp_lock.h> 13#include <linux/smp_lock.h>
14#include <linux/gfp.h>
14#include "isofs.h" 15#include "isofs.h"
15 16
16int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode) 17int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode)
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index eaa831311c9c..ab438beb867c 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/smp_lock.h> 9#include <linux/smp_lock.h>
10#include <linux/gfp.h>
10#include "isofs.h" 11#include "isofs.h"
11 12
12/* 13/*
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 4bd882548c45..ecb44c94ba8d 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -17,7 +17,6 @@
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/jbd.h> 18#include <linux/jbd.h>
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <linux/slab.h>
21#include <linux/mm.h> 20#include <linux/mm.h>
22#include <linux/pagemap.h> 21#include <linux/pagemap.h>
23#include <linux/bio.h> 22#include <linux/bio.h>
@@ -862,12 +861,12 @@ restart_loop:
862 /* A buffer which has been freed while still being 861 /* A buffer which has been freed while still being
863 * journaled by a previous transaction may end up still 862 * journaled by a previous transaction may end up still
864 * being dirty here, but we want to avoid writing back 863 * being dirty here, but we want to avoid writing back
865 * that buffer in the future now that the last use has 864 * that buffer in the future after the "add to orphan"
866 * been committed. That's not only a performance gain, 865 * operation has been committed. That's not only a performance
867 * it also stops aliasing problems if the buffer is left 866 * gain, it also stops aliasing problems if the buffer is
868 * behind for writeback and gets reallocated for another 867 * left behind for writeback and gets reallocated for another
869 * use in a different page. */ 868 * use in a different page. */
870 if (buffer_freed(bh)) { 869 if (buffer_freed(bh) && !jh->b_next_transaction) {
871 clear_buffer_freed(bh); 870 clear_buffer_freed(bh);
872 clear_buffer_jbddirty(bh); 871 clear_buffer_jbddirty(bh);
873 } 872 }
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index cb1a49ae605e..54c9bc9e1b17 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -20,7 +20,6 @@
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/jbd.h> 21#include <linux/jbd.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/slab.h>
24#endif 23#endif
25 24
26/* 25/*
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 006f9ad838a2..5ae71e75a491 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1398,7 +1398,7 @@ int journal_stop(handle_t *handle)
1398 * the case where our storage is so fast that it is more optimal to go 1398 * the case where our storage is so fast that it is more optimal to go
1399 * ahead and force a flush and wait for the transaction to be committed 1399 * ahead and force a flush and wait for the transaction to be committed
1400 * than it is to wait for an arbitrary amount of time for new writers to 1400 * than it is to wait for an arbitrary amount of time for new writers to
1401 * join the transaction. We acheive this by measuring how long it takes 1401 * join the transaction. We achieve this by measuring how long it takes
1402 * to commit a transaction, and compare it with how long this 1402 * to commit a transaction, and compare it with how long this
1403 * transaction has been running, and if run time < commit time then we 1403 * transaction has been running, and if run time < commit time then we
1404 * sleep for the delta and commit. This greatly helps super fast disks 1404 * sleep for the delta and commit. This greatly helps super fast disks
@@ -1864,6 +1864,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1864 if (!jh) 1864 if (!jh)
1865 goto zap_buffer_no_jh; 1865 goto zap_buffer_no_jh;
1866 1866
1867 /*
1868 * We cannot remove the buffer from checkpoint lists until the
1869 * transaction adding inode to orphan list (let's call it T)
1870 * is committed. Otherwise if the transaction changing the
1871 * buffer would be cleaned from the journal before T is
1872 * committed, a crash would cause the correct contents of
1873 * the buffer to be lost. On the other hand we have to
1874 * clear the buffer dirty bit at latest at the moment when the
1875 * transaction marking the buffer as freed in the filesystem
1876 * structures is committed because from that moment on the
1877 * buffer can be reallocated and used by a different page.
1878 * Since the block hasn't been freed yet but the inode has
1879 * already been added to orphan list, it is safe for us to add
1880 * the buffer to BJ_Forget list of the newest transaction.
1881 */
1867 transaction = jh->b_transaction; 1882 transaction = jh->b_transaction;
1868 if (transaction == NULL) { 1883 if (transaction == NULL) {
1869 /* First case: not on any transaction. If it 1884 /* First case: not on any transaction. If it
@@ -1929,16 +1944,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1929 goto zap_buffer; 1944 goto zap_buffer;
1930 } 1945 }
1931 /* 1946 /*
1932 * If it is committing, we simply cannot touch it. We 1947 * The buffer is committing, we simply cannot touch
1933 * can remove it's next_transaction pointer from the 1948 * it. So we just set b_next_transaction to the
1934 * running transaction if that is set, but nothing 1949 * running transaction (if there is one) and mark
1935 * else. */ 1950 * buffer as freed so that commit code knows it should
1951 * clear dirty bits when it is done with the buffer.
1952 */
1936 set_buffer_freed(bh); 1953 set_buffer_freed(bh);
1937 if (jh->b_next_transaction) { 1954 if (journal->j_running_transaction && buffer_jbddirty(bh))
1938 J_ASSERT(jh->b_next_transaction == 1955 jh->b_next_transaction = journal->j_running_transaction;
1939 journal->j_running_transaction);
1940 jh->b_next_transaction = NULL;
1941 }
1942 journal_put_journal_head(jh); 1956 journal_put_journal_head(jh);
1943 spin_unlock(&journal->j_list_lock); 1957 spin_unlock(&journal->j_list_lock);
1944 jbd_unlock_bh_state(bh); 1958 jbd_unlock_bh_state(bh);
@@ -2120,7 +2134,7 @@ void journal_file_buffer(struct journal_head *jh,
2120 */ 2134 */
2121void __journal_refile_buffer(struct journal_head *jh) 2135void __journal_refile_buffer(struct journal_head *jh)
2122{ 2136{
2123 int was_dirty; 2137 int was_dirty, jlist;
2124 struct buffer_head *bh = jh2bh(jh); 2138 struct buffer_head *bh = jh2bh(jh);
2125 2139
2126 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); 2140 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
@@ -2142,8 +2156,13 @@ void __journal_refile_buffer(struct journal_head *jh)
2142 __journal_temp_unlink_buffer(jh); 2156 __journal_temp_unlink_buffer(jh);
2143 jh->b_transaction = jh->b_next_transaction; 2157 jh->b_transaction = jh->b_next_transaction;
2144 jh->b_next_transaction = NULL; 2158 jh->b_next_transaction = NULL;
2145 __journal_file_buffer(jh, jh->b_transaction, 2159 if (buffer_freed(bh))
2146 jh->b_modified ? BJ_Metadata : BJ_Reserved); 2160 jlist = BJ_Forget;
2161 else if (jh->b_modified)
2162 jlist = BJ_Metadata;
2163 else
2164 jlist = BJ_Reserved;
2165 __journal_file_buffer(jh, jh->b_transaction, jlist);
2147 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); 2166 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
2148 2167
2149 if (was_dirty) 2168 if (was_dirty)
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index ca0f5eb62b20..30beb11ef928 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -22,6 +22,7 @@
22#include <linux/jbd2.h> 22#include <linux/jbd2.h>
23#include <linux/errno.h> 23#include <linux/errno.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/blkdev.h>
25#include <trace/events/jbd2.h> 26#include <trace/events/jbd2.h>
26 27
27/* 28/*
@@ -506,6 +507,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
506 if (blocknr < journal->j_tail) 507 if (blocknr < journal->j_tail)
507 freed = freed + journal->j_last - journal->j_first; 508 freed = freed + journal->j_last - journal->j_first;
508 509
510 trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed);
509 jbd_debug(1, 511 jbd_debug(1,
510 "Cleaning journal tail from %d to %d (offset %lu), " 512 "Cleaning journal tail from %d to %d (offset %lu), "
511 "freeing %lu\n", 513 "freeing %lu\n",
@@ -515,6 +517,20 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
515 journal->j_tail_sequence = first_tid; 517 journal->j_tail_sequence = first_tid;
516 journal->j_tail = blocknr; 518 journal->j_tail = blocknr;
517 spin_unlock(&journal->j_state_lock); 519 spin_unlock(&journal->j_state_lock);
520
521 /*
522 * If there is an external journal, we need to make sure that
523 * any data blocks that were recently written out --- perhaps
524 * by jbd2_log_do_checkpoint() --- are flushed out before we
525 * drop the transactions from the external journal. It's
526 * unlikely this will be necessary, especially with an
527 * appropriately sized journal, but we need this to guarantee
528 * correctness. Fortunately jbd2_cleanup_journal_tail()
529 * doesn't get called all that often.
530 */
531 if ((journal->j_fs_dev != journal->j_dev) &&
532 (journal->j_flags & JBD2_BARRIER))
533 blkdev_issue_flush(journal->j_fs_dev, NULL);
518 if (!(journal->j_flags & JBD2_ABORT)) 534 if (!(journal->j_flags & JBD2_ABORT))
519 jbd2_journal_update_superblock(journal, 1); 535 jbd2_journal_update_superblock(journal, 1);
520 return 0; 536 return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6a10238d2c63..671da7fb7ffd 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -259,6 +259,7 @@ static int journal_submit_data_buffers(journal_t *journal,
259 ret = err; 259 ret = err;
260 spin_lock(&journal->j_list_lock); 260 spin_lock(&journal->j_list_lock);
261 J_ASSERT(jinode->i_transaction == commit_transaction); 261 J_ASSERT(jinode->i_transaction == commit_transaction);
262 commit_transaction->t_flushed_data_blocks = 1;
262 jinode->i_flags &= ~JI_COMMIT_RUNNING; 263 jinode->i_flags &= ~JI_COMMIT_RUNNING;
263 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 264 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
264 } 265 }
@@ -708,8 +709,17 @@ start_journal_io:
708 } 709 }
709 } 710 }
710 711
711 /* Done it all: now write the commit record asynchronously. */ 712 /*
713 * If the journal is not located on the file system device,
714 * then we must flush the file system device before we issue
715 * the commit record
716 */
717 if (commit_transaction->t_flushed_data_blocks &&
718 (journal->j_fs_dev != journal->j_dev) &&
719 (journal->j_flags & JBD2_BARRIER))
720 blkdev_issue_flush(journal->j_fs_dev, NULL);
712 721
722 /* Done it all: now write the commit record asynchronously. */
713 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 723 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
714 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 724 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
715 err = journal_submit_commit_record(journal, commit_transaction, 725 err = journal_submit_commit_record(journal, commit_transaction,
@@ -720,13 +730,6 @@ start_journal_io:
720 blkdev_issue_flush(journal->j_dev, NULL); 730 blkdev_issue_flush(journal->j_dev, NULL);
721 } 731 }
722 732
723 /*
724 * This is the right place to wait for data buffers both for ASYNC
725 * and !ASYNC commit. If commit is ASYNC, we need to wait only after
726 * the commit block went to disk (which happens above). If commit is
727 * SYNC, we need to wait for data buffers before we start writing
728 * commit block, which happens below in such setting.
729 */
730 err = journal_finish_inode_data_buffers(journal, commit_transaction); 733 err = journal_finish_inode_data_buffers(journal, commit_transaction);
731 if (err) { 734 if (err) {
732 printk(KERN_WARNING 735 printk(KERN_WARNING
@@ -880,8 +883,7 @@ restart_loop:
880 spin_unlock(&journal->j_list_lock); 883 spin_unlock(&journal->j_list_lock);
881 bh = jh2bh(jh); 884 bh = jh2bh(jh);
882 jbd_lock_bh_state(bh); 885 jbd_lock_bh_state(bh);
883 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || 886 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
884 jh->b_transaction == journal->j_running_transaction);
885 887
886 /* 888 /*
887 * If there is undo-protected committed data against 889 * If there is undo-protected committed data against
@@ -927,12 +929,12 @@ restart_loop:
927 /* A buffer which has been freed while still being 929 /* A buffer which has been freed while still being
928 * journaled by a previous transaction may end up still 930 * journaled by a previous transaction may end up still
929 * being dirty here, but we want to avoid writing back 931 * being dirty here, but we want to avoid writing back
930 * that buffer in the future now that the last use has 932 * that buffer in the future after the "add to orphan"
931 * been committed. That's not only a performance gain, 933 * operation has been committed. That's not only a performance
932 * it also stops aliasing problems if the buffer is left 934 * gain, it also stops aliasing problems if the buffer is
933 * behind for writeback and gets reallocated for another 935 * left behind for writeback and gets reallocated for another
934 * use in a different page. */ 936 * use in a different page. */
935 if (buffer_freed(bh)) { 937 if (buffer_freed(bh) && !jh->b_next_transaction) {
936 clear_buffer_freed(bh); 938 clear_buffer_freed(bh);
937 clear_buffer_jbddirty(bh); 939 clear_buffer_jbddirty(bh);
938 } 940 }
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 17af879e6e9e..c03d4dce4d76 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -39,6 +39,8 @@
39#include <linux/seq_file.h> 39#include <linux/seq_file.h>
40#include <linux/math64.h> 40#include <linux/math64.h>
41#include <linux/hash.h> 41#include <linux/hash.h>
42#include <linux/log2.h>
43#include <linux/vmalloc.h>
42 44
43#define CREATE_TRACE_POINTS 45#define CREATE_TRACE_POINTS
44#include <trace/events/jbd2.h> 46#include <trace/events/jbd2.h>
@@ -93,6 +95,7 @@ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
93 95
94static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); 96static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
95static void __journal_abort_soft (journal_t *journal, int errno); 97static void __journal_abort_soft (journal_t *journal, int errno);
98static int jbd2_journal_create_slab(size_t slab_size);
96 99
97/* 100/*
98 * Helper function used to manage commit timeouts 101 * Helper function used to manage commit timeouts
@@ -814,7 +817,7 @@ static journal_t * journal_init_common (void)
814 journal_t *journal; 817 journal_t *journal;
815 int err; 818 int err;
816 819
817 journal = kzalloc(sizeof(*journal), GFP_KERNEL|__GFP_NOFAIL); 820 journal = kzalloc(sizeof(*journal), GFP_KERNEL);
818 if (!journal) 821 if (!journal)
819 goto fail; 822 goto fail;
820 823
@@ -1248,6 +1251,13 @@ int jbd2_journal_load(journal_t *journal)
1248 } 1251 }
1249 } 1252 }
1250 1253
1254 /*
1255 * Create a slab for this blocksize
1256 */
1257 err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
1258 if (err)
1259 return err;
1260
1251 /* Let the recovery code check whether it needs to recover any 1261 /* Let the recovery code check whether it needs to recover any
1252 * data from the journal. */ 1262 * data from the journal. */
1253 if (jbd2_journal_recover(journal)) 1263 if (jbd2_journal_recover(journal))
@@ -1807,6 +1817,127 @@ size_t journal_tag_bytes(journal_t *journal)
1807} 1817}
1808 1818
1809/* 1819/*
1820 * JBD memory management
1821 *
1822 * These functions are used to allocate block-sized chunks of memory
1823 * used for making copies of buffer_head data. Very often it will be
1824 * page-sized chunks of data, but sometimes it will be in
1825 * sub-page-size chunks. (For example, 16k pages on Power systems
1826 * with a 4k block file system.) For blocks smaller than a page, we
1827 * use a SLAB allocator. There are slab caches for each block size,
1828 * which are allocated at mount time, if necessary, and we only free
1829 * (all of) the slab caches when/if the jbd2 module is unloaded. For
1830 * this reason we don't need a mutex to protect access to
1831 * jbd2_slab[] when allocating or releasing memory; only in
1832 * jbd2_journal_create_slab().
1833 */
1834#define JBD2_MAX_SLABS 8
1835static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
1836static DECLARE_MUTEX(jbd2_slab_create_sem);
1837
1838static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
1839 "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
1840 "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
1841};
1842
1843
1844static void jbd2_journal_destroy_slabs(void)
1845{
1846 int i;
1847
1848 for (i = 0; i < JBD2_MAX_SLABS; i++) {
1849 if (jbd2_slab[i])
1850 kmem_cache_destroy(jbd2_slab[i]);
1851 jbd2_slab[i] = NULL;
1852 }
1853}
1854
1855static int jbd2_journal_create_slab(size_t size)
1856{
1857 int i = order_base_2(size) - 10;
1858 size_t slab_size;
1859
1860 if (size == PAGE_SIZE)
1861 return 0;
1862
1863 if (i >= JBD2_MAX_SLABS)
1864 return -EINVAL;
1865
1866 if (unlikely(i < 0))
1867 i = 0;
1868 down(&jbd2_slab_create_sem);
1869 if (jbd2_slab[i]) {
1870 up(&jbd2_slab_create_sem);
1871 return 0; /* Already created */
1872 }
1873
1874 slab_size = 1 << (i+10);
1875 jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
1876 slab_size, 0, NULL);
1877 up(&jbd2_slab_create_sem);
1878 if (!jbd2_slab[i]) {
1879 printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
1880 return -ENOMEM;
1881 }
1882 return 0;
1883}
1884
1885static struct kmem_cache *get_slab(size_t size)
1886{
1887 int i = order_base_2(size) - 10;
1888
1889 BUG_ON(i >= JBD2_MAX_SLABS);
1890 if (unlikely(i < 0))
1891 i = 0;
1892 BUG_ON(jbd2_slab[i] == 0);
1893 return jbd2_slab[i];
1894}
1895
1896void *jbd2_alloc(size_t size, gfp_t flags)
1897{
1898 void *ptr;
1899
1900 BUG_ON(size & (size-1)); /* Must be a power of 2 */
1901
1902 flags |= __GFP_REPEAT;
1903 if (size == PAGE_SIZE)
1904 ptr = (void *)__get_free_pages(flags, 0);
1905 else if (size > PAGE_SIZE) {
1906 int order = get_order(size);
1907
1908 if (order < 3)
1909 ptr = (void *)__get_free_pages(flags, order);
1910 else
1911 ptr = vmalloc(size);
1912 } else
1913 ptr = kmem_cache_alloc(get_slab(size), flags);
1914
1915 /* Check alignment; SLUB has gotten this wrong in the past,
1916 * and this can lead to user data corruption! */
1917 BUG_ON(((unsigned long) ptr) & (size-1));
1918
1919 return ptr;
1920}
1921
1922void jbd2_free(void *ptr, size_t size)
1923{
1924 if (size == PAGE_SIZE) {
1925 free_pages((unsigned long)ptr, 0);
1926 return;
1927 }
1928 if (size > PAGE_SIZE) {
1929 int order = get_order(size);
1930
1931 if (order < 3)
1932 free_pages((unsigned long)ptr, order);
1933 else
1934 vfree(ptr);
1935 return;
1936 }
1937 kmem_cache_free(get_slab(size), ptr);
1938};
1939
1940/*
1810 * Journal_head storage management 1941 * Journal_head storage management
1811 */ 1942 */
1812static struct kmem_cache *jbd2_journal_head_cache; 1943static struct kmem_cache *jbd2_journal_head_cache;
@@ -2204,6 +2335,7 @@ static void jbd2_journal_destroy_caches(void)
2204 jbd2_journal_destroy_revoke_caches(); 2335 jbd2_journal_destroy_revoke_caches();
2205 jbd2_journal_destroy_jbd2_journal_head_cache(); 2336 jbd2_journal_destroy_jbd2_journal_head_cache();
2206 jbd2_journal_destroy_handle_cache(); 2337 jbd2_journal_destroy_handle_cache();
2338 jbd2_journal_destroy_slabs();
2207} 2339}
2208 2340
2209static int __init journal_init(void) 2341static int __init journal_init(void)
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 73063285b13f..049281b7cb89 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -20,7 +20,6 @@
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/jbd2.h> 21#include <linux/jbd2.h>
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/slab.h>
24#include <linux/crc32.h> 23#include <linux/crc32.h>
25#endif 24#endif
26 25
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index a0512700542f..bfc70f57900f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1727,6 +1727,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1727 if (!jh) 1727 if (!jh)
1728 goto zap_buffer_no_jh; 1728 goto zap_buffer_no_jh;
1729 1729
1730 /*
1731 * We cannot remove the buffer from checkpoint lists until the
1732 * transaction adding inode to orphan list (let's call it T)
1733 * is committed. Otherwise if the transaction changing the
1734 * buffer would be cleaned from the journal before T is
1735 * committed, a crash would cause the correct contents of
1736 * the buffer to be lost. On the other hand we have to
1737 * clear the buffer dirty bit at latest at the moment when the
1738 * transaction marking the buffer as freed in the filesystem
1739 * structures is committed because from that moment on the
1740 * buffer can be reallocated and used by a different page.
1741 * Since the block hasn't been freed yet but the inode has
1742 * already been added to orphan list, it is safe for us to add
1743 * the buffer to BJ_Forget list of the newest transaction.
1744 */
1730 transaction = jh->b_transaction; 1745 transaction = jh->b_transaction;
1731 if (transaction == NULL) { 1746 if (transaction == NULL) {
1732 /* First case: not on any transaction. If it 1747 /* First case: not on any transaction. If it
@@ -1783,16 +1798,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1783 } else if (transaction == journal->j_committing_transaction) { 1798 } else if (transaction == journal->j_committing_transaction) {
1784 JBUFFER_TRACE(jh, "on committing transaction"); 1799 JBUFFER_TRACE(jh, "on committing transaction");
1785 /* 1800 /*
1786 * If it is committing, we simply cannot touch it. We 1801 * The buffer is committing, we simply cannot touch
1787 * can remove it's next_transaction pointer from the 1802 * it. So we just set b_next_transaction to the
1788 * running transaction if that is set, but nothing 1803 * running transaction (if there is one) and mark
1789 * else. */ 1804 * buffer as freed so that commit code knows it should
1805 * clear dirty bits when it is done with the buffer.
1806 */
1790 set_buffer_freed(bh); 1807 set_buffer_freed(bh);
1791 if (jh->b_next_transaction) { 1808 if (journal->j_running_transaction && buffer_jbddirty(bh))
1792 J_ASSERT(jh->b_next_transaction == 1809 jh->b_next_transaction = journal->j_running_transaction;
1793 journal->j_running_transaction);
1794 jh->b_next_transaction = NULL;
1795 }
1796 jbd2_journal_put_journal_head(jh); 1810 jbd2_journal_put_journal_head(jh);
1797 spin_unlock(&journal->j_list_lock); 1811 spin_unlock(&journal->j_list_lock);
1798 jbd_unlock_bh_state(bh); 1812 jbd_unlock_bh_state(bh);
@@ -1969,7 +1983,7 @@ void jbd2_journal_file_buffer(struct journal_head *jh,
1969 */ 1983 */
1970void __jbd2_journal_refile_buffer(struct journal_head *jh) 1984void __jbd2_journal_refile_buffer(struct journal_head *jh)
1971{ 1985{
1972 int was_dirty; 1986 int was_dirty, jlist;
1973 struct buffer_head *bh = jh2bh(jh); 1987 struct buffer_head *bh = jh2bh(jh);
1974 1988
1975 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); 1989 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
@@ -1991,8 +2005,13 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
1991 __jbd2_journal_temp_unlink_buffer(jh); 2005 __jbd2_journal_temp_unlink_buffer(jh);
1992 jh->b_transaction = jh->b_next_transaction; 2006 jh->b_transaction = jh->b_next_transaction;
1993 jh->b_next_transaction = NULL; 2007 jh->b_next_transaction = NULL;
1994 __jbd2_journal_file_buffer(jh, jh->b_transaction, 2008 if (buffer_freed(bh))
1995 jh->b_modified ? BJ_Metadata : BJ_Reserved); 2009 jlist = BJ_Forget;
2010 else if (jh->b_modified)
2011 jlist = BJ_Metadata;
2012 else
2013 jlist = BJ_Reserved;
2014 __jbd2_journal_file_buffer(jh, jh->b_transaction, jlist);
1996 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); 2015 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
1997 2016
1998 if (was_dirty) 2017 if (was_dirty)
diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index 90cb60d09787..cd02acafde8a 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -11,7 +11,6 @@
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/slab.h>
15#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/lzo.h> 16#include <linux/lzo.h>
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index cfd301a5edfc..b46661a42758 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -14,7 +14,6 @@
14#endif 14#endif
15 15
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/slab.h>
18#include <linux/zlib.h> 17#include <linux/zlib.h>
19#include <linux/zutil.h> 18#include <linux/zutil.h>
20#include "nodelist.h" 19#include "nodelist.h"
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index 5544d31c066b..ec3538413926 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -15,6 +15,7 @@
15#include <linux/crc32.h> 15#include <linux/crc32.h>
16#include <linux/jffs2.h> 16#include <linux/jffs2.h>
17#include <linux/mtd/mtd.h> 17#include <linux/mtd/mtd.h>
18#include <linux/slab.h>
18#include "nodelist.h" 19#include "nodelist.h"
19#include "debug.h" 20#include "debug.h"
20 21
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index b7b74e299142..e7291c161a19 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/slab.h>
14#include <linux/fs.h> 13#include <linux/fs.h>
15#include <linux/time.h> 14#include <linux/time.h>
16#include <linux/pagemap.h> 15#include <linux/pagemap.h>
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 87c6f555e1a0..af02bd138469 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -15,7 +15,6 @@
15#include <linux/mtd/mtd.h> 15#include <linux/mtd/mtd.h>
16#include <linux/rbtree.h> 16#include <linux/rbtree.h>
17#include <linux/crc32.h> 17#include <linux/crc32.h>
18#include <linux/slab.h>
19#include <linux/pagemap.h> 18#include <linux/pagemap.h>
20#include "nodelist.h" 19#include "nodelist.h"
21 20
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 21a052915aa9..191359dde4e1 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/slab.h>
14#include <linux/mtd/mtd.h> 13#include <linux/mtd/mtd.h>
15#include <linux/compiler.h> 14#include <linux/compiler.h>
16#include <linux/sched.h> /* For cond_resched() */ 15#include <linux/sched.h> /* For cond_resched() */
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index e22de8397b74..d32ee9412cb9 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -567,7 +567,7 @@ static void jffs2_free_tmp_dnode_info_list(struct rb_root *list)
567 else BUG(); 567 else BUG();
568 } 568 }
569 } 569 }
570 list->rb_node = NULL; 570 *list = RB_ROOT;
571} 571}
572 572
573static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd) 573static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd)
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 4ec11e8bda8c..b955626071c2 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/slab.h>
14#include <linux/fs.h> 13#include <linux/fs.h>
15#include <linux/namei.h> 14#include <linux/namei.h>
16#include "nodelist.h" 15#include "nodelist.h"
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index ca29440e9435..c819eb0e982d 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -12,7 +12,6 @@
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/crc32.h> 14#include <linux/crc32.h>
15#include <linux/slab.h>
16#include <linux/pagemap.h> 15#include <linux/pagemap.h>
17#include <linux/mtd/mtd.h> 16#include <linux/mtd/mtd.h>
18#include "nodelist.h" 17#include "nodelist.h"
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index d66477c34306..1057a4998e4e 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -19,8 +19,8 @@
19 */ 19 */
20 20
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/slab.h>
22#include <linux/fs.h> 23#include <linux/fs.h>
23#include <linux/quotaops.h>
24#include <linux/posix_acl_xattr.h> 24#include <linux/posix_acl_xattr.h>
25#include "jfs_incore.h" 25#include "jfs_incore.h"
26#include "jfs_txnmgr.h" 26#include "jfs_txnmgr.h"
@@ -174,7 +174,7 @@ cleanup:
174 return rc; 174 return rc;
175} 175}
176 176
177static int jfs_acl_chmod(struct inode *inode) 177int jfs_acl_chmod(struct inode *inode)
178{ 178{
179 struct posix_acl *acl, *clone; 179 struct posix_acl *acl, *clone;
180 int rc; 180 int rc;
@@ -205,26 +205,3 @@ static int jfs_acl_chmod(struct inode *inode)
205 posix_acl_release(clone); 205 posix_acl_release(clone);
206 return rc; 206 return rc;
207} 207}
208
209int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
210{
211 struct inode *inode = dentry->d_inode;
212 int rc;
213
214 rc = inode_change_ok(inode, iattr);
215 if (rc)
216 return rc;
217
218 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
219 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
220 if (vfs_dq_transfer(inode, iattr))
221 return -EDQUOT;
222 }
223
224 rc = inode_setattr(inode, iattr);
225
226 if (!rc && (iattr->ia_valid & ATTR_MODE))
227 rc = jfs_acl_chmod(inode);
228
229 return rc;
230}
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 2b70fa78e4a7..14ba982b3f24 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -18,6 +18,7 @@
18 */ 18 */
19 19
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/quotaops.h>
21#include "jfs_incore.h" 22#include "jfs_incore.h"
22#include "jfs_inode.h" 23#include "jfs_inode.h"
23#include "jfs_dmap.h" 24#include "jfs_dmap.h"
@@ -47,7 +48,7 @@ static int jfs_open(struct inode *inode, struct file *file)
47{ 48{
48 int rc; 49 int rc;
49 50
50 if ((rc = generic_file_open(inode, file))) 51 if ((rc = dquot_file_open(inode, file)))
51 return rc; 52 return rc;
52 53
53 /* 54 /*
@@ -88,14 +89,40 @@ static int jfs_release(struct inode *inode, struct file *file)
88 return 0; 89 return 0;
89} 90}
90 91
92int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
93{
94 struct inode *inode = dentry->d_inode;
95 int rc;
96
97 rc = inode_change_ok(inode, iattr);
98 if (rc)
99 return rc;
100
101 if (iattr->ia_valid & ATTR_SIZE)
102 dquot_initialize(inode);
103 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
104 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
105 rc = dquot_transfer(inode, iattr);
106 if (rc)
107 return rc;
108 }
109
110 rc = inode_setattr(inode, iattr);
111
112 if (!rc && (iattr->ia_valid & ATTR_MODE))
113 rc = jfs_acl_chmod(inode);
114
115 return rc;
116}
117
91const struct inode_operations jfs_file_inode_operations = { 118const struct inode_operations jfs_file_inode_operations = {
92 .truncate = jfs_truncate, 119 .truncate = jfs_truncate,
93 .setxattr = jfs_setxattr, 120 .setxattr = jfs_setxattr,
94 .getxattr = jfs_getxattr, 121 .getxattr = jfs_getxattr,
95 .listxattr = jfs_listxattr, 122 .listxattr = jfs_listxattr,
96 .removexattr = jfs_removexattr, 123 .removexattr = jfs_removexattr,
97#ifdef CONFIG_JFS_POSIX_ACL
98 .setattr = jfs_setattr, 124 .setattr = jfs_setattr,
125#ifdef CONFIG_JFS_POSIX_ACL
99 .check_acl = jfs_check_acl, 126 .check_acl = jfs_check_acl,
100#endif 127#endif
101}; 128};
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index b2ae190a77ba..ed9ba6fe04f5 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -22,6 +22,7 @@
22#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/pagemap.h> 23#include <linux/pagemap.h>
24#include <linux/quotaops.h> 24#include <linux/quotaops.h>
25#include <linux/writeback.h>
25#include "jfs_incore.h" 26#include "jfs_incore.h"
26#include "jfs_inode.h" 27#include "jfs_inode.h"
27#include "jfs_filsys.h" 28#include "jfs_filsys.h"
@@ -60,7 +61,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
60 inode->i_op = &page_symlink_inode_operations; 61 inode->i_op = &page_symlink_inode_operations;
61 inode->i_mapping->a_ops = &jfs_aops; 62 inode->i_mapping->a_ops = &jfs_aops;
62 } else { 63 } else {
63 inode->i_op = &jfs_symlink_inode_operations; 64 inode->i_op = &jfs_fast_symlink_inode_operations;
64 /* 65 /*
65 * The inline data should be null-terminated, but 66 * The inline data should be null-terminated, but
66 * don't let on-disk corruption crash the kernel 67 * don't let on-disk corruption crash the kernel
@@ -120,8 +121,10 @@ int jfs_commit_inode(struct inode *inode, int wait)
120 return rc; 121 return rc;
121} 122}
122 123
123int jfs_write_inode(struct inode *inode, int wait) 124int jfs_write_inode(struct inode *inode, struct writeback_control *wbc)
124{ 125{
126 int wait = wbc->sync_mode == WB_SYNC_ALL;
127
125 if (test_cflag(COMMIT_Nolink, inode)) 128 if (test_cflag(COMMIT_Nolink, inode))
126 return 0; 129 return 0;
127 /* 130 /*
@@ -146,6 +149,9 @@ void jfs_delete_inode(struct inode *inode)
146{ 149{
147 jfs_info("In jfs_delete_inode, inode = 0x%p", inode); 150 jfs_info("In jfs_delete_inode, inode = 0x%p", inode);
148 151
152 if (!is_bad_inode(inode))
153 dquot_initialize(inode);
154
149 if (!is_bad_inode(inode) && 155 if (!is_bad_inode(inode) &&
150 (JFS_IP(inode)->fileset == FILESYSTEM_I)) { 156 (JFS_IP(inode)->fileset == FILESYSTEM_I)) {
151 truncate_inode_pages(&inode->i_data, 0); 157 truncate_inode_pages(&inode->i_data, 0);
@@ -158,9 +164,9 @@ void jfs_delete_inode(struct inode *inode)
158 /* 164 /*
159 * Free the inode from the quota allocation. 165 * Free the inode from the quota allocation.
160 */ 166 */
161 vfs_dq_init(inode); 167 dquot_initialize(inode);
162 vfs_dq_free_inode(inode); 168 dquot_free_inode(inode);
163 vfs_dq_drop(inode); 169 dquot_drop(inode);
164 } 170 }
165 171
166 clear_inode(inode); 172 clear_inode(inode);
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index b07bd417ef85..54e07559878d 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -22,7 +22,7 @@
22 22
23int jfs_check_acl(struct inode *, int); 23int jfs_check_acl(struct inode *, int);
24int jfs_init_acl(tid_t, struct inode *, struct inode *); 24int jfs_init_acl(tid_t, struct inode *, struct inode *);
25int jfs_setattr(struct dentry *, struct iattr *); 25int jfs_acl_chmod(struct inode *inode);
26 26
27#else 27#else
28 28
@@ -32,5 +32,10 @@ static inline int jfs_init_acl(tid_t tid, struct inode *inode,
32 return 0; 32 return 0;
33} 33}
34 34
35static inline int jfs_acl_chmod(struct inode *inode)
36{
37 return 0;
38}
39
35#endif 40#endif
36#endif /* _H_JFS_ACL */ 41#endif /* _H_JFS_ACL */
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index d9b031cf69f5..9e2f6a721668 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/slab.h>
20#include "jfs_incore.h" 21#include "jfs_incore.h"
21#include "jfs_superblock.h" 22#include "jfs_superblock.h"
22#include "jfs_dmap.h" 23#include "jfs_dmap.h"
@@ -195,7 +196,7 @@ int dbMount(struct inode *ipbmap)
195 bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); 196 bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
196 bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); 197 bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
197 bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel); 198 bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
198 bmp->db_agheigth = le32_to_cpu(dbmp_le->dn_agheigth); 199 bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight);
199 bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); 200 bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
200 bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); 201 bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
201 bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); 202 bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
@@ -287,7 +288,7 @@ int dbSync(struct inode *ipbmap)
287 dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag); 288 dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag);
288 dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref); 289 dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref);
289 dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel); 290 dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel);
290 dbmp_le->dn_agheigth = cpu_to_le32(bmp->db_agheigth); 291 dbmp_le->dn_agheight = cpu_to_le32(bmp->db_agheight);
291 dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth); 292 dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth);
292 dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart); 293 dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart);
293 dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size); 294 dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size);
@@ -1440,7 +1441,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1440 * tree index of this allocation group within the control page. 1441 * tree index of this allocation group within the control page.
1441 */ 1442 */
1442 agperlev = 1443 agperlev =
1443 (1 << (L2LPERCTL - (bmp->db_agheigth << 1))) / bmp->db_agwidth; 1444 (1 << (L2LPERCTL - (bmp->db_agheight << 1))) / bmp->db_agwidth;
1444 ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1)); 1445 ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1));
1445 1446
1446 /* dmap control page trees fan-out by 4 and a single allocation 1447 /* dmap control page trees fan-out by 4 and a single allocation
@@ -1459,7 +1460,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1459 * the subtree to find the leftmost leaf that describes this 1460 * the subtree to find the leftmost leaf that describes this
1460 * free space. 1461 * free space.
1461 */ 1462 */
1462 for (k = bmp->db_agheigth; k > 0; k--) { 1463 for (k = bmp->db_agheight; k > 0; k--) {
1463 for (n = 0, m = (ti << 2) + 1; n < 4; n++) { 1464 for (n = 0, m = (ti << 2) + 1; n < 4; n++) {
1464 if (l2nb <= dcp->stree[m + n]) { 1465 if (l2nb <= dcp->stree[m + n]) {
1465 ti = m + n; 1466 ti = m + n;
@@ -3606,7 +3607,7 @@ void dbFinalizeBmap(struct inode *ipbmap)
3606 } 3607 }
3607 3608
3608 /* 3609 /*
3609 * compute db_aglevel, db_agheigth, db_width, db_agstart: 3610 * compute db_aglevel, db_agheight, db_width, db_agstart:
3610 * an ag is covered in aglevel dmapctl summary tree, 3611 * an ag is covered in aglevel dmapctl summary tree,
3611 * at agheight level height (from leaf) with agwidth number of nodes 3612 * at agheight level height (from leaf) with agwidth number of nodes
3612 * each, which starts at agstart index node of the smmary tree node 3613 * each, which starts at agstart index node of the smmary tree node
@@ -3615,9 +3616,9 @@ void dbFinalizeBmap(struct inode *ipbmap)
3615 bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize); 3616 bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize);
3616 l2nl = 3617 l2nl =
3617 bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL); 3618 bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL);
3618 bmp->db_agheigth = l2nl >> 1; 3619 bmp->db_agheight = l2nl >> 1;
3619 bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheigth << 1)); 3620 bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheight << 1));
3620 for (i = 5 - bmp->db_agheigth, bmp->db_agstart = 0, n = 1; i > 0; 3621 for (i = 5 - bmp->db_agheight, bmp->db_agstart = 0, n = 1; i > 0;
3621 i--) { 3622 i--) {
3622 bmp->db_agstart += n; 3623 bmp->db_agstart += n;
3623 n <<= 2; 3624 n <<= 2;
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
index 1a6eb41569bc..6dcb906c55d8 100644
--- a/fs/jfs/jfs_dmap.h
+++ b/fs/jfs/jfs_dmap.h
@@ -210,7 +210,7 @@ struct dbmap_disk {
210 __le32 dn_maxag; /* 4: max active alloc group number */ 210 __le32 dn_maxag; /* 4: max active alloc group number */
211 __le32 dn_agpref; /* 4: preferred alloc group (hint) */ 211 __le32 dn_agpref; /* 4: preferred alloc group (hint) */
212 __le32 dn_aglevel; /* 4: dmapctl level holding the AG */ 212 __le32 dn_aglevel; /* 4: dmapctl level holding the AG */
213 __le32 dn_agheigth; /* 4: height in dmapctl of the AG */ 213 __le32 dn_agheight; /* 4: height in dmapctl of the AG */
214 __le32 dn_agwidth; /* 4: width in dmapctl of the AG */ 214 __le32 dn_agwidth; /* 4: width in dmapctl of the AG */
215 __le32 dn_agstart; /* 4: start tree index at AG height */ 215 __le32 dn_agstart; /* 4: start tree index at AG height */
216 __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */ 216 __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */
@@ -229,7 +229,7 @@ struct dbmap {
229 int dn_maxag; /* max active alloc group number */ 229 int dn_maxag; /* max active alloc group number */
230 int dn_agpref; /* preferred alloc group (hint) */ 230 int dn_agpref; /* preferred alloc group (hint) */
231 int dn_aglevel; /* dmapctl level holding the AG */ 231 int dn_aglevel; /* dmapctl level holding the AG */
232 int dn_agheigth; /* height in dmapctl of the AG */ 232 int dn_agheight; /* height in dmapctl of the AG */
233 int dn_agwidth; /* width in dmapctl of the AG */ 233 int dn_agwidth; /* width in dmapctl of the AG */
234 int dn_agstart; /* start tree index at AG height */ 234 int dn_agstart; /* start tree index at AG height */
235 int dn_agl2size; /* l2 num of blks per alloc group */ 235 int dn_agl2size; /* l2 num of blks per alloc group */
@@ -255,7 +255,7 @@ struct bmap {
255#define db_agsize db_bmap.dn_agsize 255#define db_agsize db_bmap.dn_agsize
256#define db_agl2size db_bmap.dn_agl2size 256#define db_agl2size db_bmap.dn_agl2size
257#define db_agwidth db_bmap.dn_agwidth 257#define db_agwidth db_bmap.dn_agwidth
258#define db_agheigth db_bmap.dn_agheigth 258#define db_agheight db_bmap.dn_agheight
259#define db_agstart db_bmap.dn_agstart 259#define db_agstart db_bmap.dn_agstart
260#define db_numag db_bmap.dn_numag 260#define db_numag db_bmap.dn_numag
261#define db_maxlevel db_bmap.dn_maxlevel 261#define db_maxlevel db_bmap.dn_maxlevel
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 925871e9887b..9197a1b0d02d 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -102,6 +102,7 @@
102 102
103#include <linux/fs.h> 103#include <linux/fs.h>
104#include <linux/quotaops.h> 104#include <linux/quotaops.h>
105#include <linux/slab.h>
105#include "jfs_incore.h" 106#include "jfs_incore.h"
106#include "jfs_superblock.h" 107#include "jfs_superblock.h"
107#include "jfs_filsys.h" 108#include "jfs_filsys.h"
@@ -381,10 +382,10 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
381 * It's time to move the inline table to an external 382 * It's time to move the inline table to an external
382 * page and begin to build the xtree 383 * page and begin to build the xtree
383 */ 384 */
384 if (vfs_dq_alloc_block(ip, sbi->nbperpage)) 385 if (dquot_alloc_block(ip, sbi->nbperpage))
385 goto clean_up; 386 goto clean_up;
386 if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) { 387 if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) {
387 vfs_dq_free_block(ip, sbi->nbperpage); 388 dquot_free_block(ip, sbi->nbperpage);
388 goto clean_up; 389 goto clean_up;
389 } 390 }
390 391
@@ -408,7 +409,7 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
408 memcpy(&jfs_ip->i_dirtable, temp_table, 409 memcpy(&jfs_ip->i_dirtable, temp_table,
409 sizeof (temp_table)); 410 sizeof (temp_table));
410 dbFree(ip, xaddr, sbi->nbperpage); 411 dbFree(ip, xaddr, sbi->nbperpage);
411 vfs_dq_free_block(ip, sbi->nbperpage); 412 dquot_free_block(ip, sbi->nbperpage);
412 goto clean_up; 413 goto clean_up;
413 } 414 }
414 ip->i_size = PSIZE; 415 ip->i_size = PSIZE;
@@ -1027,10 +1028,9 @@ static int dtSplitUp(tid_t tid,
1027 n = xlen; 1028 n = xlen;
1028 1029
1029 /* Allocate blocks to quota. */ 1030 /* Allocate blocks to quota. */
1030 if (vfs_dq_alloc_block(ip, n)) { 1031 rc = dquot_alloc_block(ip, n);
1031 rc = -EDQUOT; 1032 if (rc)
1032 goto extendOut; 1033 goto extendOut;
1033 }
1034 quota_allocation += n; 1034 quota_allocation += n;
1035 1035
1036 if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen, 1036 if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen,
@@ -1308,7 +1308,7 @@ static int dtSplitUp(tid_t tid,
1308 1308
1309 /* Rollback quota allocation */ 1309 /* Rollback quota allocation */
1310 if (rc && quota_allocation) 1310 if (rc && quota_allocation)
1311 vfs_dq_free_block(ip, quota_allocation); 1311 dquot_free_block(ip, quota_allocation);
1312 1312
1313 dtSplitUp_Exit: 1313 dtSplitUp_Exit:
1314 1314
@@ -1369,9 +1369,10 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1369 return -EIO; 1369 return -EIO;
1370 1370
1371 /* Allocate blocks to quota. */ 1371 /* Allocate blocks to quota. */
1372 if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { 1372 rc = dquot_alloc_block(ip, lengthPXD(pxd));
1373 if (rc) {
1373 release_metapage(rmp); 1374 release_metapage(rmp);
1374 return -EDQUOT; 1375 return rc;
1375 } 1376 }
1376 1377
1377 jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); 1378 jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp);
@@ -1892,6 +1893,7 @@ static int dtSplitRoot(tid_t tid,
1892 struct dt_lock *dtlck; 1893 struct dt_lock *dtlck;
1893 struct tlock *tlck; 1894 struct tlock *tlck;
1894 struct lv *lv; 1895 struct lv *lv;
1896 int rc;
1895 1897
1896 /* get split root page */ 1898 /* get split root page */
1897 smp = split->mp; 1899 smp = split->mp;
@@ -1916,9 +1918,10 @@ static int dtSplitRoot(tid_t tid,
1916 rp = rmp->data; 1918 rp = rmp->data;
1917 1919
1918 /* Allocate blocks to quota. */ 1920 /* Allocate blocks to quota. */
1919 if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { 1921 rc = dquot_alloc_block(ip, lengthPXD(pxd));
1922 if (rc) {
1920 release_metapage(rmp); 1923 release_metapage(rmp);
1921 return -EDQUOT; 1924 return rc;
1922 } 1925 }
1923 1926
1924 BT_MARK_DIRTY(rmp, ip); 1927 BT_MARK_DIRTY(rmp, ip);
@@ -2287,7 +2290,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
2287 xlen = lengthPXD(&fp->header.self); 2290 xlen = lengthPXD(&fp->header.self);
2288 2291
2289 /* Free quota allocation. */ 2292 /* Free quota allocation. */
2290 vfs_dq_free_block(ip, xlen); 2293 dquot_free_block(ip, xlen);
2291 2294
2292 /* free/invalidate its buffer page */ 2295 /* free/invalidate its buffer page */
2293 discard_metapage(fmp); 2296 discard_metapage(fmp);
@@ -2363,7 +2366,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
2363 xlen = lengthPXD(&p->header.self); 2366 xlen = lengthPXD(&p->header.self);
2364 2367
2365 /* Free quota allocation */ 2368 /* Free quota allocation */
2366 vfs_dq_free_block(ip, xlen); 2369 dquot_free_block(ip, xlen);
2367 2370
2368 /* free/invalidate its buffer page */ 2371 /* free/invalidate its buffer page */
2369 discard_metapage(mp); 2372 discard_metapage(mp);
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 41d6045dbeb0..5d3bbd10f8db 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -141,10 +141,11 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
141 } 141 }
142 142
143 /* Allocate blocks to quota. */ 143 /* Allocate blocks to quota. */
144 if (vfs_dq_alloc_block(ip, nxlen)) { 144 rc = dquot_alloc_block(ip, nxlen);
145 if (rc) {
145 dbFree(ip, nxaddr, (s64) nxlen); 146 dbFree(ip, nxaddr, (s64) nxlen);
146 mutex_unlock(&JFS_IP(ip)->commit_mutex); 147 mutex_unlock(&JFS_IP(ip)->commit_mutex);
147 return -EDQUOT; 148 return rc;
148 } 149 }
149 150
150 /* determine the value of the extent flag */ 151 /* determine the value of the extent flag */
@@ -164,7 +165,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
164 */ 165 */
165 if (rc) { 166 if (rc) {
166 dbFree(ip, nxaddr, nxlen); 167 dbFree(ip, nxaddr, nxlen);
167 vfs_dq_free_block(ip, nxlen); 168 dquot_free_block(ip, nxlen);
168 mutex_unlock(&JFS_IP(ip)->commit_mutex); 169 mutex_unlock(&JFS_IP(ip)->commit_mutex);
169 return (rc); 170 return (rc);
170 } 171 }
@@ -256,10 +257,11 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
256 goto exit; 257 goto exit;
257 258
258 /* Allocat blocks to quota. */ 259 /* Allocat blocks to quota. */
259 if (vfs_dq_alloc_block(ip, nxlen)) { 260 rc = dquot_alloc_block(ip, nxlen);
261 if (rc) {
260 dbFree(ip, nxaddr, (s64) nxlen); 262 dbFree(ip, nxaddr, (s64) nxlen);
261 mutex_unlock(&JFS_IP(ip)->commit_mutex); 263 mutex_unlock(&JFS_IP(ip)->commit_mutex);
262 return -EDQUOT; 264 return rc;
263 } 265 }
264 266
265 delta = nxlen - xlen; 267 delta = nxlen - xlen;
@@ -297,7 +299,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
297 /* extend the extent */ 299 /* extend the extent */
298 if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) { 300 if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) {
299 dbFree(ip, xaddr + xlen, delta); 301 dbFree(ip, xaddr + xlen, delta);
300 vfs_dq_free_block(ip, nxlen); 302 dquot_free_block(ip, nxlen);
301 goto exit; 303 goto exit;
302 } 304 }
303 } else { 305 } else {
@@ -308,7 +310,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
308 */ 310 */
309 if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) { 311 if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) {
310 dbFree(ip, nxaddr, nxlen); 312 dbFree(ip, nxaddr, nxlen);
311 vfs_dq_free_block(ip, nxlen); 313 dquot_free_block(ip, nxlen);
312 goto exit; 314 goto exit;
313 } 315 }
314 } 316 }
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 0fc30407f039..f8332dc8eeb2 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -45,6 +45,7 @@
45#include <linux/buffer_head.h> 45#include <linux/buffer_head.h>
46#include <linux/pagemap.h> 46#include <linux/pagemap.h>
47#include <linux/quotaops.h> 47#include <linux/quotaops.h>
48#include <linux/slab.h>
48 49
49#include "jfs_incore.h" 50#include "jfs_incore.h"
50#include "jfs_inode.h" 51#include "jfs_inode.h"
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index dc0e02159ac9..829921b67765 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -116,10 +116,10 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
116 /* 116 /*
117 * Allocate inode to quota. 117 * Allocate inode to quota.
118 */ 118 */
119 if (vfs_dq_alloc_inode(inode)) { 119 dquot_initialize(inode);
120 rc = -EDQUOT; 120 rc = dquot_alloc_inode(inode);
121 if (rc)
121 goto fail_drop; 122 goto fail_drop;
122 }
123 123
124 inode->i_mode = mode; 124 inode->i_mode = mode;
125 /* inherit flags from parent */ 125 /* inherit flags from parent */
@@ -162,7 +162,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
162 return inode; 162 return inode;
163 163
164fail_drop: 164fail_drop:
165 vfs_dq_drop(inode); 165 dquot_drop(inode);
166 inode->i_flags |= S_NOQUOTA; 166 inode->i_flags |= S_NOQUOTA;
167fail_unlock: 167fail_unlock:
168 inode->i_nlink = 0; 168 inode->i_nlink = 0;
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 1eff7db34d63..9e6bda30a6e8 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -26,7 +26,7 @@ extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); 26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
27extern struct inode *jfs_iget(struct super_block *, unsigned long); 27extern struct inode *jfs_iget(struct super_block *, unsigned long);
28extern int jfs_commit_inode(struct inode *, int); 28extern int jfs_commit_inode(struct inode *, int);
29extern int jfs_write_inode(struct inode*, int); 29extern int jfs_write_inode(struct inode *, struct writeback_control *);
30extern void jfs_delete_inode(struct inode *); 30extern void jfs_delete_inode(struct inode *);
31extern void jfs_dirty_inode(struct inode *); 31extern void jfs_dirty_inode(struct inode *);
32extern void jfs_truncate(struct inode *); 32extern void jfs_truncate(struct inode *);
@@ -40,6 +40,7 @@ extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid,
40 int fh_len, int fh_type); 40 int fh_len, int fh_type);
41extern void jfs_set_inode_flags(struct inode *); 41extern void jfs_set_inode_flags(struct inode *);
42extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); 42extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
43extern int jfs_setattr(struct dentry *, struct iattr *);
43 44
44extern const struct address_space_operations jfs_aops; 45extern const struct address_space_operations jfs_aops;
45extern const struct inode_operations jfs_dir_inode_operations; 46extern const struct inode_operations jfs_dir_inode_operations;
@@ -47,5 +48,6 @@ extern const struct file_operations jfs_dir_operations;
47extern const struct inode_operations jfs_file_inode_operations; 48extern const struct inode_operations jfs_file_inode_operations;
48extern const struct file_operations jfs_file_operations; 49extern const struct file_operations jfs_file_operations;
49extern const struct inode_operations jfs_symlink_inode_operations; 50extern const struct inode_operations jfs_symlink_inode_operations;
51extern const struct inode_operations jfs_fast_symlink_inode_operations;
50extern const struct dentry_operations jfs_ci_dentry_operations; 52extern const struct dentry_operations jfs_ci_dentry_operations;
51#endif /* _H_JFS_INODE */ 53#endif /* _H_JFS_INODE */
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 335c4de6552d..c51af2a14516 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -70,6 +70,7 @@
70#include <linux/delay.h> 70#include <linux/delay.h>
71#include <linux/mutex.h> 71#include <linux/mutex.h>
72#include <linux/seq_file.h> 72#include <linux/seq_file.h>
73#include <linux/slab.h>
73#include "jfs_incore.h" 74#include "jfs_incore.h"
74#include "jfs_filsys.h" 75#include "jfs_filsys.h"
75#include "jfs_metapage.h" 76#include "jfs_metapage.h"
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 07b6c5dfb4b6..48b44bd8267b 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -21,6 +21,7 @@
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/bio.h> 23#include <linux/bio.h>
24#include <linux/slab.h>
24#include <linux/init.h> 25#include <linux/init.h>
25#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
26#include <linux/mempool.h> 27#include <linux/mempool.h>
diff --git a/fs/jfs/jfs_unicode.h b/fs/jfs/jfs_unicode.h
index 3fbb3a225590..8f0f02cb6ca6 100644
--- a/fs/jfs/jfs_unicode.h
+++ b/fs/jfs/jfs_unicode.h
@@ -19,6 +19,7 @@
19#ifndef _H_JFS_UNICODE 19#ifndef _H_JFS_UNICODE
20#define _H_JFS_UNICODE 20#define _H_JFS_UNICODE
21 21
22#include <linux/slab.h>
22#include <asm/byteorder.h> 23#include <asm/byteorder.h>
23#include "jfs_types.h" 24#include "jfs_types.h"
24 25
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index d654a6458648..6c50871e6220 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -585,10 +585,10 @@ int xtInsert(tid_t tid, /* transaction id */
585 hint = addressXAD(xad) + lengthXAD(xad) - 1; 585 hint = addressXAD(xad) + lengthXAD(xad) - 1;
586 } else 586 } else
587 hint = 0; 587 hint = 0;
588 if ((rc = vfs_dq_alloc_block(ip, xlen))) 588 if ((rc = dquot_alloc_block(ip, xlen)))
589 goto out; 589 goto out;
590 if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) { 590 if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) {
591 vfs_dq_free_block(ip, xlen); 591 dquot_free_block(ip, xlen);
592 goto out; 592 goto out;
593 } 593 }
594 } 594 }
@@ -617,7 +617,7 @@ int xtInsert(tid_t tid, /* transaction id */
617 /* undo data extent allocation */ 617 /* undo data extent allocation */
618 if (*xaddrp == 0) { 618 if (*xaddrp == 0) {
619 dbFree(ip, xaddr, (s64) xlen); 619 dbFree(ip, xaddr, (s64) xlen);
620 vfs_dq_free_block(ip, xlen); 620 dquot_free_block(ip, xlen);
621 } 621 }
622 return rc; 622 return rc;
623 } 623 }
@@ -985,10 +985,9 @@ xtSplitPage(tid_t tid, struct inode *ip,
985 rbn = addressPXD(pxd); 985 rbn = addressPXD(pxd);
986 986
987 /* Allocate blocks to quota. */ 987 /* Allocate blocks to quota. */
988 if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { 988 rc = dquot_alloc_block(ip, lengthPXD(pxd));
989 rc = -EDQUOT; 989 if (rc)
990 goto clean_up; 990 goto clean_up;
991 }
992 991
993 quota_allocation += lengthPXD(pxd); 992 quota_allocation += lengthPXD(pxd);
994 993
@@ -1195,7 +1194,7 @@ xtSplitPage(tid_t tid, struct inode *ip,
1195 1194
1196 /* Rollback quota allocation. */ 1195 /* Rollback quota allocation. */
1197 if (quota_allocation) 1196 if (quota_allocation)
1198 vfs_dq_free_block(ip, quota_allocation); 1197 dquot_free_block(ip, quota_allocation);
1199 1198
1200 return (rc); 1199 return (rc);
1201} 1200}
@@ -1235,6 +1234,7 @@ xtSplitRoot(tid_t tid,
1235 struct pxdlist *pxdlist; 1234 struct pxdlist *pxdlist;
1236 struct tlock *tlck; 1235 struct tlock *tlck;
1237 struct xtlock *xtlck; 1236 struct xtlock *xtlck;
1237 int rc;
1238 1238
1239 sp = &JFS_IP(ip)->i_xtroot; 1239 sp = &JFS_IP(ip)->i_xtroot;
1240 1240
@@ -1252,9 +1252,10 @@ xtSplitRoot(tid_t tid,
1252 return -EIO; 1252 return -EIO;
1253 1253
1254 /* Allocate blocks to quota. */ 1254 /* Allocate blocks to quota. */
1255 if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { 1255 rc = dquot_alloc_block(ip, lengthPXD(pxd));
1256 if (rc) {
1256 release_metapage(rmp); 1257 release_metapage(rmp);
1257 return -EDQUOT; 1258 return rc;
1258 } 1259 }
1259 1260
1260 jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp); 1261 jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp);
@@ -3680,7 +3681,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3680 ip->i_size = newsize; 3681 ip->i_size = newsize;
3681 3682
3682 /* update quota allocation to reflect freed blocks */ 3683 /* update quota allocation to reflect freed blocks */
3683 vfs_dq_free_block(ip, nfreed); 3684 dquot_free_block(ip, nfreed);
3684 3685
3685 /* 3686 /*
3686 * free tlock of invalidated pages 3687 * free tlock of invalidated pages
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index c79a4270f083..a9cf8e8675be 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -85,6 +85,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
85 85
86 jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name); 86 jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name);
87 87
88 dquot_initialize(dip);
89
88 /* 90 /*
89 * search parent directory for entry/freespace 91 * search parent directory for entry/freespace
90 * (dtSearch() returns parent directory page pinned) 92 * (dtSearch() returns parent directory page pinned)
@@ -215,6 +217,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
215 217
216 jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name); 218 jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name);
217 219
220 dquot_initialize(dip);
221
218 /* link count overflow on parent directory ? */ 222 /* link count overflow on parent directory ? */
219 if (dip->i_nlink == JFS_LINK_MAX) { 223 if (dip->i_nlink == JFS_LINK_MAX) {
220 rc = -EMLINK; 224 rc = -EMLINK;
@@ -356,7 +360,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
356 jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); 360 jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
357 361
358 /* Init inode for quota operations. */ 362 /* Init inode for quota operations. */
359 vfs_dq_init(ip); 363 dquot_initialize(dip);
364 dquot_initialize(ip);
360 365
361 /* directory must be empty to be removed */ 366 /* directory must be empty to be removed */
362 if (!dtEmpty(ip)) { 367 if (!dtEmpty(ip)) {
@@ -483,7 +488,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
483 jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name); 488 jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name);
484 489
485 /* Init inode for quota operations. */ 490 /* Init inode for quota operations. */
486 vfs_dq_init(ip); 491 dquot_initialize(dip);
492 dquot_initialize(ip);
487 493
488 if ((rc = get_UCSname(&dname, dentry))) 494 if ((rc = get_UCSname(&dname, dentry)))
489 goto out; 495 goto out;
@@ -805,6 +811,8 @@ static int jfs_link(struct dentry *old_dentry,
805 if (ip->i_nlink == 0) 811 if (ip->i_nlink == 0)
806 return -ENOENT; 812 return -ENOENT;
807 813
814 dquot_initialize(dir);
815
808 tid = txBegin(ip->i_sb, 0); 816 tid = txBegin(ip->i_sb, 0);
809 817
810 mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); 818 mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT);
@@ -896,6 +904,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
896 904
897 jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name); 905 jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name);
898 906
907 dquot_initialize(dip);
908
899 ssize = strlen(name) + 1; 909 ssize = strlen(name) + 1;
900 910
901 /* 911 /*
@@ -946,7 +956,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
946 */ 956 */
947 957
948 if (ssize <= IDATASIZE) { 958 if (ssize <= IDATASIZE) {
949 ip->i_op = &jfs_symlink_inode_operations; 959 ip->i_op = &jfs_fast_symlink_inode_operations;
950 960
951 i_fastsymlink = JFS_IP(ip)->i_inline; 961 i_fastsymlink = JFS_IP(ip)->i_inline;
952 memcpy(i_fastsymlink, name, ssize); 962 memcpy(i_fastsymlink, name, ssize);
@@ -968,7 +978,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
968 else { 978 else {
969 jfs_info("jfs_symlink: allocate extent ip:0x%p", ip); 979 jfs_info("jfs_symlink: allocate extent ip:0x%p", ip);
970 980
971 ip->i_op = &page_symlink_inode_operations; 981 ip->i_op = &jfs_symlink_inode_operations;
972 ip->i_mapping->a_ops = &jfs_aops; 982 ip->i_mapping->a_ops = &jfs_aops;
973 983
974 /* 984 /*
@@ -1087,6 +1097,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1087 jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, 1097 jfs_info("jfs_rename: %s %s", old_dentry->d_name.name,
1088 new_dentry->d_name.name); 1098 new_dentry->d_name.name);
1089 1099
1100 dquot_initialize(old_dir);
1101 dquot_initialize(new_dir);
1102
1090 old_ip = old_dentry->d_inode; 1103 old_ip = old_dentry->d_inode;
1091 new_ip = new_dentry->d_inode; 1104 new_ip = new_dentry->d_inode;
1092 1105
@@ -1136,7 +1149,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1136 } else if (new_ip) { 1149 } else if (new_ip) {
1137 IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL); 1150 IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL);
1138 /* Init inode for quota operations. */ 1151 /* Init inode for quota operations. */
1139 vfs_dq_init(new_ip); 1152 dquot_initialize(new_ip);
1140 } 1153 }
1141 1154
1142 /* 1155 /*
@@ -1360,6 +1373,8 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1360 1373
1361 jfs_info("jfs_mknod: %s", dentry->d_name.name); 1374 jfs_info("jfs_mknod: %s", dentry->d_name.name);
1362 1375
1376 dquot_initialize(dir);
1377
1363 if ((rc = get_UCSname(&dname, dentry))) 1378 if ((rc = get_UCSname(&dname, dentry)))
1364 goto out; 1379 goto out;
1365 1380
@@ -1541,8 +1556,8 @@ const struct inode_operations jfs_dir_inode_operations = {
1541 .getxattr = jfs_getxattr, 1556 .getxattr = jfs_getxattr,
1542 .listxattr = jfs_listxattr, 1557 .listxattr = jfs_listxattr,
1543 .removexattr = jfs_removexattr, 1558 .removexattr = jfs_removexattr,
1544#ifdef CONFIG_JFS_POSIX_ACL
1545 .setattr = jfs_setattr, 1559 .setattr = jfs_setattr,
1560#ifdef CONFIG_JFS_POSIX_ACL
1546 .check_acl = jfs_check_acl, 1561 .check_acl = jfs_check_acl,
1547#endif 1562#endif
1548}; 1563};
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 7f24a0bb08ca..1aba0039f1c9 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -81,6 +81,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
81 struct inode *iplist[1]; 81 struct inode *iplist[1];
82 struct jfs_superblock *j_sb, *j_sb2; 82 struct jfs_superblock *j_sb, *j_sb2;
83 uint old_agsize; 83 uint old_agsize;
84 int agsizechanged = 0;
84 struct buffer_head *bh, *bh2; 85 struct buffer_head *bh, *bh2;
85 86
86 /* If the volume hasn't grown, get out now */ 87 /* If the volume hasn't grown, get out now */
@@ -333,6 +334,9 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
333 */ 334 */
334 if ((rc = dbExtendFS(ipbmap, XAddress, nblocks))) 335 if ((rc = dbExtendFS(ipbmap, XAddress, nblocks)))
335 goto error_out; 336 goto error_out;
337
338 agsizechanged |= (bmp->db_agsize != old_agsize);
339
336 /* 340 /*
337 * the map now has extended to cover additional nblocks: 341 * the map now has extended to cover additional nblocks:
338 * dn_mapsize = oldMapsize + nblocks; 342 * dn_mapsize = oldMapsize + nblocks;
@@ -432,7 +436,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
432 * will correctly identify the new ag); 436 * will correctly identify the new ag);
433 */ 437 */
434 /* if new AG size the same as old AG size, done! */ 438 /* if new AG size the same as old AG size, done! */
435 if (bmp->db_agsize != old_agsize) { 439 if (agsizechanged) {
436 if ((rc = diExtendFS(ipimap, ipbmap))) 440 if ((rc = diExtendFS(ipimap, ipbmap)))
437 goto error_out; 441 goto error_out;
438 442
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index d929a822a74e..157382fa6256 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -30,6 +30,7 @@
30#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
31#include <linux/exportfs.h> 31#include <linux/exportfs.h>
32#include <linux/crc32.h> 32#include <linux/crc32.h>
33#include <linux/slab.h>
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <linux/seq_file.h> 35#include <linux/seq_file.h>
35#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
@@ -131,6 +132,11 @@ static void jfs_destroy_inode(struct inode *inode)
131 kmem_cache_free(jfs_inode_cachep, ji); 132 kmem_cache_free(jfs_inode_cachep, ji);
132} 133}
133 134
135static void jfs_clear_inode(struct inode *inode)
136{
137 dquot_drop(inode);
138}
139
134static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) 140static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
135{ 141{
136 struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb); 142 struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb);
@@ -745,6 +751,7 @@ static const struct super_operations jfs_super_operations = {
745 .dirty_inode = jfs_dirty_inode, 751 .dirty_inode = jfs_dirty_inode,
746 .write_inode = jfs_write_inode, 752 .write_inode = jfs_write_inode,
747 .delete_inode = jfs_delete_inode, 753 .delete_inode = jfs_delete_inode,
754 .clear_inode = jfs_clear_inode,
748 .put_super = jfs_put_super, 755 .put_super = jfs_put_super,
749 .sync_fs = jfs_sync_fs, 756 .sync_fs = jfs_sync_fs,
750 .freeze_fs = jfs_freeze, 757 .freeze_fs = jfs_freeze,
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c
index 4af1a05aad0a..205b946d8e0d 100644
--- a/fs/jfs/symlink.c
+++ b/fs/jfs/symlink.c
@@ -29,9 +29,21 @@ static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd)
29 return NULL; 29 return NULL;
30} 30}
31 31
32const struct inode_operations jfs_symlink_inode_operations = { 32const struct inode_operations jfs_fast_symlink_inode_operations = {
33 .readlink = generic_readlink, 33 .readlink = generic_readlink,
34 .follow_link = jfs_follow_link, 34 .follow_link = jfs_follow_link,
35 .setattr = jfs_setattr,
36 .setxattr = jfs_setxattr,
37 .getxattr = jfs_getxattr,
38 .listxattr = jfs_listxattr,
39 .removexattr = jfs_removexattr,
40};
41
42const struct inode_operations jfs_symlink_inode_operations = {
43 .readlink = generic_readlink,
44 .follow_link = page_follow_link_light,
45 .put_link = page_put_link,
46 .setattr = jfs_setattr,
35 .setxattr = jfs_setxattr, 47 .setxattr = jfs_setxattr,
36 .getxattr = jfs_getxattr, 48 .getxattr = jfs_getxattr,
37 .listxattr = jfs_listxattr, 49 .listxattr = jfs_listxattr,
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index fad364548bc9..fa96bbb26343 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -21,6 +21,7 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/xattr.h> 22#include <linux/xattr.h>
23#include <linux/posix_acl_xattr.h> 23#include <linux/posix_acl_xattr.h>
24#include <linux/slab.h>
24#include <linux/quotaops.h> 25#include <linux/quotaops.h>
25#include <linux/security.h> 26#include <linux/security.h>
26#include "jfs_incore.h" 27#include "jfs_incore.h"
@@ -260,14 +261,14 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
260 nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits; 261 nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits;
261 262
262 /* Allocate new blocks to quota. */ 263 /* Allocate new blocks to quota. */
263 if (vfs_dq_alloc_block(ip, nblocks)) { 264 rc = dquot_alloc_block(ip, nblocks);
264 return -EDQUOT; 265 if (rc)
265 } 266 return rc;
266 267
267 rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno); 268 rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno);
268 if (rc) { 269 if (rc) {
269 /*Rollback quota allocation. */ 270 /*Rollback quota allocation. */
270 vfs_dq_free_block(ip, nblocks); 271 dquot_free_block(ip, nblocks);
271 return rc; 272 return rc;
272 } 273 }
273 274
@@ -332,7 +333,7 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
332 333
333 failed: 334 failed:
334 /* Rollback quota allocation. */ 335 /* Rollback quota allocation. */
335 vfs_dq_free_block(ip, nblocks); 336 dquot_free_block(ip, nblocks);
336 337
337 dbFree(ip, blkno, nblocks); 338 dbFree(ip, blkno, nblocks);
338 return rc; 339 return rc;
@@ -538,7 +539,8 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
538 539
539 if (blocks_needed > current_blocks) { 540 if (blocks_needed > current_blocks) {
540 /* Allocate new blocks to quota. */ 541 /* Allocate new blocks to quota. */
541 if (vfs_dq_alloc_block(inode, blocks_needed)) 542 rc = dquot_alloc_block(inode, blocks_needed);
543 if (rc)
542 return -EDQUOT; 544 return -EDQUOT;
543 545
544 quota_allocation = blocks_needed; 546 quota_allocation = blocks_needed;
@@ -602,7 +604,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
602 clean_up: 604 clean_up:
603 /* Rollback quota allocation */ 605 /* Rollback quota allocation */
604 if (quota_allocation) 606 if (quota_allocation)
605 vfs_dq_free_block(inode, quota_allocation); 607 dquot_free_block(inode, quota_allocation);
606 608
607 return (rc); 609 return (rc);
608} 610}
@@ -677,7 +679,7 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf,
677 679
678 /* If old blocks exist, they must be removed from quota allocation. */ 680 /* If old blocks exist, they must be removed from quota allocation. */
679 if (old_blocks) 681 if (old_blocks)
680 vfs_dq_free_block(inode, old_blocks); 682 dquot_free_block(inode, old_blocks);
681 683
682 inode->i_ctime = CURRENT_TIME; 684 inode->i_ctime = CURRENT_TIME;
683 685
diff --git a/fs/libfs.c b/fs/libfs.c
index 6e8d17e1dc4c..ea9a6cc9b35c 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -5,6 +5,7 @@
5 5
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/pagemap.h> 7#include <linux/pagemap.h>
8#include <linux/slab.h>
8#include <linux/mount.h> 9#include <linux/mount.h>
9#include <linux/vfs.h> 10#include <linux/vfs.h>
10#include <linux/mutex.h> 11#include <linux/mutex.h>
@@ -338,28 +339,14 @@ int simple_readpage(struct file *file, struct page *page)
338 return 0; 339 return 0;
339} 340}
340 341
341int simple_prepare_write(struct file *file, struct page *page,
342 unsigned from, unsigned to)
343{
344 if (!PageUptodate(page)) {
345 if (to - from != PAGE_CACHE_SIZE)
346 zero_user_segments(page,
347 0, from,
348 to, PAGE_CACHE_SIZE);
349 }
350 return 0;
351}
352
353int simple_write_begin(struct file *file, struct address_space *mapping, 342int simple_write_begin(struct file *file, struct address_space *mapping,
354 loff_t pos, unsigned len, unsigned flags, 343 loff_t pos, unsigned len, unsigned flags,
355 struct page **pagep, void **fsdata) 344 struct page **pagep, void **fsdata)
356{ 345{
357 struct page *page; 346 struct page *page;
358 pgoff_t index; 347 pgoff_t index;
359 unsigned from;
360 348
361 index = pos >> PAGE_CACHE_SHIFT; 349 index = pos >> PAGE_CACHE_SHIFT;
362 from = pos & (PAGE_CACHE_SIZE - 1);
363 350
364 page = grab_cache_page_write_begin(mapping, index, flags); 351 page = grab_cache_page_write_begin(mapping, index, flags);
365 if (!page) 352 if (!page)
@@ -367,43 +354,59 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
367 354
368 *pagep = page; 355 *pagep = page;
369 356
370 return simple_prepare_write(file, page, from, from+len); 357 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
371} 358 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
372 359
373static int simple_commit_write(struct file *file, struct page *page, 360 zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE);
374 unsigned from, unsigned to) 361 }
375{
376 struct inode *inode = page->mapping->host;
377 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
378
379 if (!PageUptodate(page))
380 SetPageUptodate(page);
381 /*
382 * No need to use i_size_read() here, the i_size
383 * cannot change under us because we hold the i_mutex.
384 */
385 if (pos > inode->i_size)
386 i_size_write(inode, pos);
387 set_page_dirty(page);
388 return 0; 362 return 0;
389} 363}
390 364
365/**
366 * simple_write_end - .write_end helper for non-block-device FSes
367 * @available: See .write_end of address_space_operations
368 * @file: "
369 * @mapping: "
370 * @pos: "
371 * @len: "
372 * @copied: "
373 * @page: "
374 * @fsdata: "
375 *
376 * simple_write_end does the minimum needed for updating a page after writing is
377 * done. It has the same API signature as the .write_end of
378 * address_space_operations vector. So it can just be set onto .write_end for
379 * FSes that don't need any other processing. i_mutex is assumed to be held.
380 * Block based filesystems should use generic_write_end().
381 * NOTE: Even though i_size might get updated by this function, mark_inode_dirty
382 * is not called, so a filesystem that actually does store data in .write_inode
383 * should extend on what's done here with a call to mark_inode_dirty() in the
384 * case that i_size has changed.
385 */
391int simple_write_end(struct file *file, struct address_space *mapping, 386int simple_write_end(struct file *file, struct address_space *mapping,
392 loff_t pos, unsigned len, unsigned copied, 387 loff_t pos, unsigned len, unsigned copied,
393 struct page *page, void *fsdata) 388 struct page *page, void *fsdata)
394{ 389{
395 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 390 struct inode *inode = page->mapping->host;
391 loff_t last_pos = pos + copied;
396 392
397 /* zero the stale part of the page if we did a short copy */ 393 /* zero the stale part of the page if we did a short copy */
398 if (copied < len) { 394 if (copied < len) {
399 void *kaddr = kmap_atomic(page, KM_USER0); 395 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
400 memset(kaddr + from + copied, 0, len - copied); 396
401 flush_dcache_page(page); 397 zero_user(page, from + copied, len - copied);
402 kunmap_atomic(kaddr, KM_USER0);
403 } 398 }
404 399
405 simple_commit_write(file, page, from, from+copied); 400 if (!PageUptodate(page))
401 SetPageUptodate(page);
402 /*
403 * No need to use i_size_read() here, the i_size
404 * cannot change under us because we hold the i_mutex.
405 */
406 if (last_pos > inode->i_size)
407 i_size_write(inode, last_pos);
406 408
409 set_page_dirty(page);
407 unlock_page(page); 410 unlock_page(page);
408 page_cache_release(page); 411 page_cache_release(page);
409 412
@@ -853,7 +856,6 @@ EXPORT_SYMBOL(simple_getattr);
853EXPORT_SYMBOL(simple_link); 856EXPORT_SYMBOL(simple_link);
854EXPORT_SYMBOL(simple_lookup); 857EXPORT_SYMBOL(simple_lookup);
855EXPORT_SYMBOL(simple_pin_fs); 858EXPORT_SYMBOL(simple_pin_fs);
856EXPORT_UNUSED_SYMBOL(simple_prepare_write);
857EXPORT_SYMBOL(simple_readpage); 859EXPORT_SYMBOL(simple_readpage);
858EXPORT_SYMBOL(simple_release_fs); 860EXPORT_SYMBOL(simple_release_fs);
859EXPORT_SYMBOL(simple_rename); 861EXPORT_SYMBOL(simple_rename);
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index fc9032dc8862..64fd427c993c 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/slab.h>
11#include <linux/time.h> 12#include <linux/time.h>
12#include <linux/nfs_fs.h> 13#include <linux/nfs_fs.h>
13#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index c81249fef11f..7932c399fab4 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/smp_lock.h> 10#include <linux/smp_lock.h>
11#include <linux/slab.h>
11#include <linux/types.h> 12#include <linux/types.h>
12#include <linux/errno.h> 13#include <linux/errno.h>
13#include <linux/fs.h> 14#include <linux/fs.h>
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 4600c2037b8b..bb464d12104c 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -479,8 +479,8 @@ again: mutex_lock(&nlm_host_mutex);
479 } 479 }
480 } 480 }
481 } 481 }
482
483 mutex_unlock(&nlm_host_mutex); 482 mutex_unlock(&nlm_host_mutex);
483 nsm_release(nsm);
484} 484}
485 485
486/* 486/*
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index f956651d0f65..e3015464fbab 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -10,6 +10,7 @@
10#include <linux/utsname.h> 10#include <linux/utsname.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/ktime.h> 12#include <linux/ktime.h>
13#include <linux/slab.h>
13 14
14#include <linux/sunrpc/clnt.h> 15#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/xprtsock.h> 16#include <linux/sunrpc/xprtsock.h>
@@ -349,9 +350,9 @@ retry:
349 * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle 350 * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle
350 * @info: pointer to NLMPROC_SM_NOTIFY arguments 351 * @info: pointer to NLMPROC_SM_NOTIFY arguments
351 * 352 *
352 * Returns a matching nsm_handle if found in the nsm cache; the returned 353 * Returns a matching nsm_handle if found in the nsm cache. The returned
353 * nsm_handle's reference count is bumped and sm_monitored is cleared. 354 * nsm_handle's reference count is bumped. Otherwise returns NULL if some
354 * Otherwise returns NULL if some error occurred. 355 * error occurred.
355 */ 356 */
356struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info) 357struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
357{ 358{
@@ -370,12 +371,6 @@ struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
370 atomic_inc(&cached->sm_count); 371 atomic_inc(&cached->sm_count);
371 spin_unlock(&nsm_lock); 372 spin_unlock(&nsm_lock);
372 373
373 /*
374 * During subsequent lock activity, force a fresh
375 * notification to be set up for this host.
376 */
377 cached->sm_monitored = 0;
378
379 dprintk("lockd: host %s (%s) rebooted, cnt %d\n", 374 dprintk("lockd: host %s (%s) rebooted, cnt %d\n",
380 cached->sm_name, cached->sm_addrbuf, 375 cached->sm_name, cached->sm_addrbuf,
381 atomic_read(&cached->sm_count)); 376 atomic_read(&cached->sm_count));
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index e50cfa3d9654..f1bacf1a0391 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -21,7 +21,6 @@
21#include <linux/errno.h> 21#include <linux/errno.h>
22#include <linux/in.h> 22#include <linux/in.h>
23#include <linux/uio.h> 23#include <linux/uio.h>
24#include <linux/slab.h>
25#include <linux/smp.h> 24#include <linux/smp.h>
26#include <linux/smp_lock.h> 25#include <linux/smp_lock.h>
27#include <linux/mutex.h> 26#include <linux/mutex.h>
@@ -243,11 +242,9 @@ static int make_socks(struct svc_serv *serv)
243 if (err < 0) 242 if (err < 0)
244 goto out_err; 243 goto out_err;
245 244
246#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
247 err = create_lockd_family(serv, PF_INET6); 245 err = create_lockd_family(serv, PF_INET6);
248 if (err < 0 && err != -EAFNOSUPPORT) 246 if (err < 0 && err != -EAFNOSUPPORT)
249 goto out_err; 247 goto out_err;
250#endif /* CONFIG_IPV6 || CONFIG_IPV6_MODULE */
251 248
252 warned = 0; 249 warned = 0;
253 return 0; 250 return 0;
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index a7966eed3c17..031c6569a134 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/slab.h>
13#include <linux/smp_lock.h> 12#include <linux/smp_lock.h>
14#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
15#include <linux/lockd/share.h> 14#include <linux/lockd/share.h>
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index d1001790fa9a..84055d31bfc5 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/types.h> 23#include <linux/types.h>
24#include <linux/slab.h>
24#include <linux/errno.h> 25#include <linux/errno.h>
25#include <linux/kernel.h> 26#include <linux/kernel.h>
26#include <linux/sched.h> 27#include <linux/sched.h>
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 56c9519d900a..0f2ab741ae7c 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/slab.h>
13#include <linux/smp_lock.h> 12#include <linux/smp_lock.h>
14#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
15#include <linux/lockd/share.h> 14#include <linux/lockd/share.h>
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index ad478da7ca63..d0ef94cfb3da 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -10,6 +10,7 @@
10#include <linux/string.h> 10#include <linux/string.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/in.h> 12#include <linux/in.h>
13#include <linux/slab.h>
13#include <linux/mutex.h> 14#include <linux/mutex.h>
14#include <linux/sunrpc/svc.h> 15#include <linux/sunrpc/svc.h>
15#include <linux/sunrpc/clnt.h> 16#include <linux/sunrpc/clnt.h>
diff --git a/fs/locks.c b/fs/locks.c
index a8794f233bc9..ab24d49fc048 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1182,8 +1182,9 @@ int __break_lease(struct inode *inode, unsigned int mode)
1182 struct file_lock *fl; 1182 struct file_lock *fl;
1183 unsigned long break_time; 1183 unsigned long break_time;
1184 int i_have_this_lease = 0; 1184 int i_have_this_lease = 0;
1185 int want_write = (mode & O_ACCMODE) != O_RDONLY;
1185 1186
1186 new_fl = lease_alloc(NULL, mode & FMODE_WRITE ? F_WRLCK : F_RDLCK); 1187 new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
1187 1188
1188 lock_kernel(); 1189 lock_kernel();
1189 1190
@@ -1197,7 +1198,7 @@ int __break_lease(struct inode *inode, unsigned int mode)
1197 if (fl->fl_owner == current->files) 1198 if (fl->fl_owner == current->files)
1198 i_have_this_lease = 1; 1199 i_have_this_lease = 1;
1199 1200
1200 if (mode & FMODE_WRITE) { 1201 if (want_write) {
1201 /* If we want write access, we have to revoke any lease. */ 1202 /* If we want write access, we have to revoke any lease. */
1202 future = F_UNLCK | F_INPROGRESS; 1203 future = F_UNLCK | F_INPROGRESS;
1203 } else if (flock->fl_type & F_INPROGRESS) { 1204 } else if (flock->fl_type & F_INPROGRESS) {
@@ -1454,7 +1455,7 @@ EXPORT_SYMBOL(generic_setlease);
1454 * leases held by processes on this node. 1455 * leases held by processes on this node.
1455 * 1456 *
1456 * There is also no break_lease method; filesystems that 1457 * There is also no break_lease method; filesystems that
1457 * handle their own leases shoud break leases themselves from the 1458 * handle their own leases should break leases themselves from the
1458 * filesystem's open, create, and (on truncate) setattr methods. 1459 * filesystem's open, create, and (on truncate) setattr methods.
1459 * 1460 *
1460 * Warning: the only current setlease methods exist only to disable 1461 * Warning: the only current setlease methods exist only to disable
diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig
new file mode 100644
index 000000000000..daf9a9b32dd3
--- /dev/null
+++ b/fs/logfs/Kconfig
@@ -0,0 +1,17 @@
1config LOGFS
2 tristate "LogFS file system (EXPERIMENTAL)"
3 depends on (MTD || BLOCK) && EXPERIMENTAL
4 select ZLIB_INFLATE
5 select ZLIB_DEFLATE
6 select CRC32
7 select BTREE
8 help
9 Flash filesystem aimed to scale efficiently to large devices.
10 In comparison to JFFS2 it offers significantly faster mount
11 times and potentially less RAM usage, although the latter has
12 not been measured yet.
13
14 In its current state it is still very experimental and should
15 not be used for other than testing purposes.
16
17 If unsure, say N.
diff --git a/fs/logfs/Makefile b/fs/logfs/Makefile
new file mode 100644
index 000000000000..4820027787ee
--- /dev/null
+++ b/fs/logfs/Makefile
@@ -0,0 +1,13 @@
1obj-$(CONFIG_LOGFS) += logfs.o
2
3logfs-y += compr.o
4logfs-y += dir.o
5logfs-y += file.o
6logfs-y += gc.o
7logfs-y += inode.o
8logfs-y += journal.o
9logfs-y += readwrite.o
10logfs-y += segment.o
11logfs-y += super.o
12logfs-$(CONFIG_BLOCK) += dev_bdev.o
13logfs-$(CONFIG_MTD) += dev_mtd.o
diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c
new file mode 100644
index 000000000000..44bbfd249abc
--- /dev/null
+++ b/fs/logfs/compr.c
@@ -0,0 +1,95 @@
1/*
2 * fs/logfs/compr.c - compression routines
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/vmalloc.h>
10#include <linux/zlib.h>
11
12#define COMPR_LEVEL 3
13
14static DEFINE_MUTEX(compr_mutex);
15static struct z_stream_s stream;
16
17int logfs_compress(void *in, void *out, size_t inlen, size_t outlen)
18{
19 int err, ret;
20
21 ret = -EIO;
22 mutex_lock(&compr_mutex);
23 err = zlib_deflateInit(&stream, COMPR_LEVEL);
24 if (err != Z_OK)
25 goto error;
26
27 stream.next_in = in;
28 stream.avail_in = inlen;
29 stream.total_in = 0;
30 stream.next_out = out;
31 stream.avail_out = outlen;
32 stream.total_out = 0;
33
34 err = zlib_deflate(&stream, Z_FINISH);
35 if (err != Z_STREAM_END)
36 goto error;
37
38 err = zlib_deflateEnd(&stream);
39 if (err != Z_OK)
40 goto error;
41
42 if (stream.total_out >= stream.total_in)
43 goto error;
44
45 ret = stream.total_out;
46error:
47 mutex_unlock(&compr_mutex);
48 return ret;
49}
50
51int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen)
52{
53 int err, ret;
54
55 ret = -EIO;
56 mutex_lock(&compr_mutex);
57 err = zlib_inflateInit(&stream);
58 if (err != Z_OK)
59 goto error;
60
61 stream.next_in = in;
62 stream.avail_in = inlen;
63 stream.total_in = 0;
64 stream.next_out = out;
65 stream.avail_out = outlen;
66 stream.total_out = 0;
67
68 err = zlib_inflate(&stream, Z_FINISH);
69 if (err != Z_STREAM_END)
70 goto error;
71
72 err = zlib_inflateEnd(&stream);
73 if (err != Z_OK)
74 goto error;
75
76 ret = 0;
77error:
78 mutex_unlock(&compr_mutex);
79 return ret;
80}
81
82int __init logfs_compr_init(void)
83{
84 size_t size = max(zlib_deflate_workspacesize(),
85 zlib_inflate_workspacesize());
86 stream.workspace = vmalloc(size);
87 if (!stream.workspace)
88 return -ENOMEM;
89 return 0;
90}
91
92void logfs_compr_exit(void)
93{
94 vfree(stream.workspace);
95}
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
new file mode 100644
index 000000000000..243c00071f76
--- /dev/null
+++ b/fs/logfs/dev_bdev.c
@@ -0,0 +1,333 @@
1/*
2 * fs/logfs/dev_bdev.c - Device access methods for block devices
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/bio.h>
10#include <linux/blkdev.h>
11#include <linux/buffer_head.h>
12#include <linux/gfp.h>
13
14#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
15
16static void request_complete(struct bio *bio, int err)
17{
18 complete((struct completion *)bio->bi_private);
19}
20
21static int sync_request(struct page *page, struct block_device *bdev, int rw)
22{
23 struct bio bio;
24 struct bio_vec bio_vec;
25 struct completion complete;
26
27 bio_init(&bio);
28 bio.bi_io_vec = &bio_vec;
29 bio_vec.bv_page = page;
30 bio_vec.bv_len = PAGE_SIZE;
31 bio_vec.bv_offset = 0;
32 bio.bi_vcnt = 1;
33 bio.bi_idx = 0;
34 bio.bi_size = PAGE_SIZE;
35 bio.bi_bdev = bdev;
36 bio.bi_sector = page->index * (PAGE_SIZE >> 9);
37 init_completion(&complete);
38 bio.bi_private = &complete;
39 bio.bi_end_io = request_complete;
40
41 submit_bio(rw, &bio);
42 generic_unplug_device(bdev_get_queue(bdev));
43 wait_for_completion(&complete);
44 return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
45}
46
47static int bdev_readpage(void *_sb, struct page *page)
48{
49 struct super_block *sb = _sb;
50 struct block_device *bdev = logfs_super(sb)->s_bdev;
51 int err;
52
53 err = sync_request(page, bdev, READ);
54 if (err) {
55 ClearPageUptodate(page);
56 SetPageError(page);
57 } else {
58 SetPageUptodate(page);
59 ClearPageError(page);
60 }
61 unlock_page(page);
62 return err;
63}
64
65static DECLARE_WAIT_QUEUE_HEAD(wq);
66
67static void writeseg_end_io(struct bio *bio, int err)
68{
69 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
70 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
71 struct super_block *sb = bio->bi_private;
72 struct logfs_super *super = logfs_super(sb);
73 struct page *page;
74
75 BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
76 BUG_ON(err);
77 BUG_ON(bio->bi_vcnt == 0);
78 do {
79 page = bvec->bv_page;
80 if (--bvec >= bio->bi_io_vec)
81 prefetchw(&bvec->bv_page->flags);
82
83 end_page_writeback(page);
84 page_cache_release(page);
85 } while (bvec >= bio->bi_io_vec);
86 bio_put(bio);
87 if (atomic_dec_and_test(&super->s_pending_writes))
88 wake_up(&wq);
89}
90
91static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
92 size_t nr_pages)
93{
94 struct logfs_super *super = logfs_super(sb);
95 struct address_space *mapping = super->s_mapping_inode->i_mapping;
96 struct bio *bio;
97 struct page *page;
98 struct request_queue *q = bdev_get_queue(sb->s_bdev);
99 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
100 int i;
101
102 if (max_pages > BIO_MAX_PAGES)
103 max_pages = BIO_MAX_PAGES;
104 bio = bio_alloc(GFP_NOFS, max_pages);
105 BUG_ON(!bio);
106
107 for (i = 0; i < nr_pages; i++) {
108 if (i >= max_pages) {
109 /* Block layer cannot split bios :( */
110 bio->bi_vcnt = i;
111 bio->bi_idx = 0;
112 bio->bi_size = i * PAGE_SIZE;
113 bio->bi_bdev = super->s_bdev;
114 bio->bi_sector = ofs >> 9;
115 bio->bi_private = sb;
116 bio->bi_end_io = writeseg_end_io;
117 atomic_inc(&super->s_pending_writes);
118 submit_bio(WRITE, bio);
119
120 ofs += i * PAGE_SIZE;
121 index += i;
122 nr_pages -= i;
123 i = 0;
124
125 bio = bio_alloc(GFP_NOFS, max_pages);
126 BUG_ON(!bio);
127 }
128 page = find_lock_page(mapping, index + i);
129 BUG_ON(!page);
130 bio->bi_io_vec[i].bv_page = page;
131 bio->bi_io_vec[i].bv_len = PAGE_SIZE;
132 bio->bi_io_vec[i].bv_offset = 0;
133
134 BUG_ON(PageWriteback(page));
135 set_page_writeback(page);
136 unlock_page(page);
137 }
138 bio->bi_vcnt = nr_pages;
139 bio->bi_idx = 0;
140 bio->bi_size = nr_pages * PAGE_SIZE;
141 bio->bi_bdev = super->s_bdev;
142 bio->bi_sector = ofs >> 9;
143 bio->bi_private = sb;
144 bio->bi_end_io = writeseg_end_io;
145 atomic_inc(&super->s_pending_writes);
146 submit_bio(WRITE, bio);
147 return 0;
148}
149
150static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
151{
152 struct logfs_super *super = logfs_super(sb);
153 int head;
154
155 BUG_ON(super->s_flags & LOGFS_SB_FLAG_RO);
156
157 if (len == 0) {
158 /* This can happen when the object fit perfectly into a
159 * segment, the segment gets written per sync and subsequently
160 * closed.
161 */
162 return;
163 }
164 head = ofs & (PAGE_SIZE - 1);
165 if (head) {
166 ofs -= head;
167 len += head;
168 }
169 len = PAGE_ALIGN(len);
170 __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
171 generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev));
172}
173
174
175static void erase_end_io(struct bio *bio, int err)
176{
177 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
178 struct super_block *sb = bio->bi_private;
179 struct logfs_super *super = logfs_super(sb);
180
181 BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
182 BUG_ON(err);
183 BUG_ON(bio->bi_vcnt == 0);
184 bio_put(bio);
185 if (atomic_dec_and_test(&super->s_pending_writes))
186 wake_up(&wq);
187}
188
189static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
190 size_t nr_pages)
191{
192 struct logfs_super *super = logfs_super(sb);
193 struct bio *bio;
194 struct request_queue *q = bdev_get_queue(sb->s_bdev);
195 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
196 int i;
197
198 if (max_pages > BIO_MAX_PAGES)
199 max_pages = BIO_MAX_PAGES;
200 bio = bio_alloc(GFP_NOFS, max_pages);
201 BUG_ON(!bio);
202
203 for (i = 0; i < nr_pages; i++) {
204 if (i >= max_pages) {
205 /* Block layer cannot split bios :( */
206 bio->bi_vcnt = i;
207 bio->bi_idx = 0;
208 bio->bi_size = i * PAGE_SIZE;
209 bio->bi_bdev = super->s_bdev;
210 bio->bi_sector = ofs >> 9;
211 bio->bi_private = sb;
212 bio->bi_end_io = erase_end_io;
213 atomic_inc(&super->s_pending_writes);
214 submit_bio(WRITE, bio);
215
216 ofs += i * PAGE_SIZE;
217 index += i;
218 nr_pages -= i;
219 i = 0;
220
221 bio = bio_alloc(GFP_NOFS, max_pages);
222 BUG_ON(!bio);
223 }
224 bio->bi_io_vec[i].bv_page = super->s_erase_page;
225 bio->bi_io_vec[i].bv_len = PAGE_SIZE;
226 bio->bi_io_vec[i].bv_offset = 0;
227 }
228 bio->bi_vcnt = nr_pages;
229 bio->bi_idx = 0;
230 bio->bi_size = nr_pages * PAGE_SIZE;
231 bio->bi_bdev = super->s_bdev;
232 bio->bi_sector = ofs >> 9;
233 bio->bi_private = sb;
234 bio->bi_end_io = erase_end_io;
235 atomic_inc(&super->s_pending_writes);
236 submit_bio(WRITE, bio);
237 return 0;
238}
239
240static int bdev_erase(struct super_block *sb, loff_t to, size_t len,
241 int ensure_write)
242{
243 struct logfs_super *super = logfs_super(sb);
244
245 BUG_ON(to & (PAGE_SIZE - 1));
246 BUG_ON(len & (PAGE_SIZE - 1));
247
248 if (super->s_flags & LOGFS_SB_FLAG_RO)
249 return -EROFS;
250
251 if (ensure_write) {
252 /*
253 * Object store doesn't care whether erases happen or not.
254 * But for the journal they are required. Otherwise a scan
255 * can find an old commit entry and assume it is the current
256 * one, travelling back in time.
257 */
258 do_erase(sb, to, to >> PAGE_SHIFT, len >> PAGE_SHIFT);
259 }
260
261 return 0;
262}
263
264static void bdev_sync(struct super_block *sb)
265{
266 struct logfs_super *super = logfs_super(sb);
267
268 wait_event(wq, atomic_read(&super->s_pending_writes) == 0);
269}
270
271static struct page *bdev_find_first_sb(struct super_block *sb, u64 *ofs)
272{
273 struct logfs_super *super = logfs_super(sb);
274 struct address_space *mapping = super->s_mapping_inode->i_mapping;
275 filler_t *filler = bdev_readpage;
276
277 *ofs = 0;
278 return read_cache_page(mapping, 0, filler, sb);
279}
280
281static struct page *bdev_find_last_sb(struct super_block *sb, u64 *ofs)
282{
283 struct logfs_super *super = logfs_super(sb);
284 struct address_space *mapping = super->s_mapping_inode->i_mapping;
285 filler_t *filler = bdev_readpage;
286 u64 pos = (super->s_bdev->bd_inode->i_size & ~0xfffULL) - 0x1000;
287 pgoff_t index = pos >> PAGE_SHIFT;
288
289 *ofs = pos;
290 return read_cache_page(mapping, index, filler, sb);
291}
292
293static int bdev_write_sb(struct super_block *sb, struct page *page)
294{
295 struct block_device *bdev = logfs_super(sb)->s_bdev;
296
297 /* Nothing special to do for block devices. */
298 return sync_request(page, bdev, WRITE);
299}
300
301static void bdev_put_device(struct super_block *sb)
302{
303 close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE);
304}
305
306static const struct logfs_device_ops bd_devops = {
307 .find_first_sb = bdev_find_first_sb,
308 .find_last_sb = bdev_find_last_sb,
309 .write_sb = bdev_write_sb,
310 .readpage = bdev_readpage,
311 .writeseg = bdev_writeseg,
312 .erase = bdev_erase,
313 .sync = bdev_sync,
314 .put_device = bdev_put_device,
315};
316
317int logfs_get_sb_bdev(struct file_system_type *type, int flags,
318 const char *devname, struct vfsmount *mnt)
319{
320 struct block_device *bdev;
321
322 bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type);
323 if (IS_ERR(bdev))
324 return PTR_ERR(bdev);
325
326 if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
327 int mtdnr = MINOR(bdev->bd_dev);
328 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
329 return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
330 }
331
332 return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt);
333}
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
new file mode 100644
index 000000000000..cafb6ef2e05b
--- /dev/null
+++ b/fs/logfs/dev_mtd.c
@@ -0,0 +1,254 @@
1/*
2 * fs/logfs/dev_mtd.c - Device access methods for MTD
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/completion.h>
10#include <linux/mount.h>
11#include <linux/sched.h>
12
13#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
14
15static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf)
16{
17 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
18 size_t retlen;
19 int ret;
20
21 ret = mtd->read(mtd, ofs, len, &retlen, buf);
22 BUG_ON(ret == -EINVAL);
23 if (ret)
24 return ret;
25
26 /* Not sure if we should loop instead. */
27 if (retlen != len)
28 return -EIO;
29
30 return 0;
31}
32
33static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf)
34{
35 struct logfs_super *super = logfs_super(sb);
36 struct mtd_info *mtd = super->s_mtd;
37 size_t retlen;
38 loff_t page_start, page_end;
39 int ret;
40
41 if (super->s_flags & LOGFS_SB_FLAG_RO)
42 return -EROFS;
43
44 BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs));
45 BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift);
46 BUG_ON(len > PAGE_CACHE_SIZE);
47 page_start = ofs & PAGE_CACHE_MASK;
48 page_end = PAGE_CACHE_ALIGN(ofs + len) - 1;
49 ret = mtd->write(mtd, ofs, len, &retlen, buf);
50 if (ret || (retlen != len))
51 return -EIO;
52
53 return 0;
54}
55
56/*
57 * For as long as I can remember (since about 2001) mtd->erase has been an
58 * asynchronous interface lacking the first driver to actually use the
59 * asynchronous properties. So just to prevent the first implementor of such
60 * a thing from breaking logfs in 2350, we do the usual pointless dance to
61 * declare a completion variable and wait for completion before returning
62 * from mtd_erase(). What an exercise in futility!
63 */
64static void logfs_erase_callback(struct erase_info *ei)
65{
66 complete((struct completion *)ei->priv);
67}
68
69static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len)
70{
71 struct logfs_super *super = logfs_super(sb);
72 struct address_space *mapping = super->s_mapping_inode->i_mapping;
73 struct page *page;
74 pgoff_t index = ofs >> PAGE_SHIFT;
75
76 for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT; index++) {
77 page = find_get_page(mapping, index);
78 if (!page)
79 continue;
80 memset(page_address(page), 0xFF, PAGE_SIZE);
81 page_cache_release(page);
82 }
83 return 0;
84}
85
86static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len,
87 int ensure_write)
88{
89 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
90 struct erase_info ei;
91 DECLARE_COMPLETION_ONSTACK(complete);
92 int ret;
93
94 BUG_ON(len % mtd->erasesize);
95 if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO)
96 return -EROFS;
97
98 memset(&ei, 0, sizeof(ei));
99 ei.mtd = mtd;
100 ei.addr = ofs;
101 ei.len = len;
102 ei.callback = logfs_erase_callback;
103 ei.priv = (long)&complete;
104 ret = mtd->erase(mtd, &ei);
105 if (ret)
106 return -EIO;
107
108 wait_for_completion(&complete);
109 if (ei.state != MTD_ERASE_DONE)
110 return -EIO;
111 return mtd_erase_mapping(sb, ofs, len);
112}
113
114static void mtd_sync(struct super_block *sb)
115{
116 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
117
118 if (mtd->sync)
119 mtd->sync(mtd);
120}
121
122static int mtd_readpage(void *_sb, struct page *page)
123{
124 struct super_block *sb = _sb;
125 int err;
126
127 err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
128 page_address(page));
129 if (err == -EUCLEAN) {
130 err = 0;
131 /* FIXME: force GC this segment */
132 }
133 if (err) {
134 ClearPageUptodate(page);
135 SetPageError(page);
136 } else {
137 SetPageUptodate(page);
138 ClearPageError(page);
139 }
140 unlock_page(page);
141 return err;
142}
143
144static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs)
145{
146 struct logfs_super *super = logfs_super(sb);
147 struct address_space *mapping = super->s_mapping_inode->i_mapping;
148 filler_t *filler = mtd_readpage;
149 struct mtd_info *mtd = super->s_mtd;
150
151 if (!mtd->block_isbad)
152 return NULL;
153
154 *ofs = 0;
155 while (mtd->block_isbad(mtd, *ofs)) {
156 *ofs += mtd->erasesize;
157 if (*ofs >= mtd->size)
158 return NULL;
159 }
160 BUG_ON(*ofs & ~PAGE_MASK);
161 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
162}
163
164static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs)
165{
166 struct logfs_super *super = logfs_super(sb);
167 struct address_space *mapping = super->s_mapping_inode->i_mapping;
168 filler_t *filler = mtd_readpage;
169 struct mtd_info *mtd = super->s_mtd;
170
171 if (!mtd->block_isbad)
172 return NULL;
173
174 *ofs = mtd->size - mtd->erasesize;
175 while (mtd->block_isbad(mtd, *ofs)) {
176 *ofs -= mtd->erasesize;
177 if (*ofs <= 0)
178 return NULL;
179 }
180 *ofs = *ofs + mtd->erasesize - 0x1000;
181 BUG_ON(*ofs & ~PAGE_MASK);
182 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
183}
184
185static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
186 size_t nr_pages)
187{
188 struct logfs_super *super = logfs_super(sb);
189 struct address_space *mapping = super->s_mapping_inode->i_mapping;
190 struct page *page;
191 int i, err;
192
193 for (i = 0; i < nr_pages; i++) {
194 page = find_lock_page(mapping, index + i);
195 BUG_ON(!page);
196
197 err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
198 page_address(page));
199 unlock_page(page);
200 page_cache_release(page);
201 if (err)
202 return err;
203 }
204 return 0;
205}
206
207static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
208{
209 struct logfs_super *super = logfs_super(sb);
210 int head;
211
212 if (super->s_flags & LOGFS_SB_FLAG_RO)
213 return;
214
215 if (len == 0) {
216 /* This can happen when the object fit perfectly into a
217 * segment, the segment gets written per sync and subsequently
218 * closed.
219 */
220 return;
221 }
222 head = ofs & (PAGE_SIZE - 1);
223 if (head) {
224 ofs -= head;
225 len += head;
226 }
227 len = PAGE_ALIGN(len);
228 __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
229}
230
231static void mtd_put_device(struct super_block *sb)
232{
233 put_mtd_device(logfs_super(sb)->s_mtd);
234}
235
236static const struct logfs_device_ops mtd_devops = {
237 .find_first_sb = mtd_find_first_sb,
238 .find_last_sb = mtd_find_last_sb,
239 .readpage = mtd_readpage,
240 .writeseg = mtd_writeseg,
241 .erase = mtd_erase,
242 .sync = mtd_sync,
243 .put_device = mtd_put_device,
244};
245
246int logfs_get_sb_mtd(struct file_system_type *type, int flags,
247 int mtdnr, struct vfsmount *mnt)
248{
249 struct mtd_info *mtd;
250 const struct logfs_device_ops *devops = &mtd_devops;
251
252 mtd = get_mtd_device(NULL, mtdnr);
253 return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt);
254}
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
new file mode 100644
index 000000000000..2396a85c0f55
--- /dev/null
+++ b/fs/logfs/dir.c
@@ -0,0 +1,827 @@
1/*
2 * fs/logfs/dir.c - directory-related code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/slab.h>
10
11/*
12 * Atomic dir operations
13 *
14 * Directory operations are by default not atomic. Dentries and Inodes are
15 * created/removed/altered in separate operations. Therefore we need to do
16 * a small amount of journaling.
17 *
18 * Create, link, mkdir, mknod and symlink all share the same function to do
19 * the work: __logfs_create. This function works in two atomic steps:
20 * 1. allocate inode (remember in journal)
21 * 2. allocate dentry (clear journal)
22 *
23 * As we can only get interrupted between the two, the inode we just
24 * created is simply stored in the anchor. On next mount, if we were
25 * interrupted, we delete the inode. From a user's point of view the
26 * operation never happened.
27 *
28 * Unlink and rmdir also share the same function: unlink. Again, this
29 * function works in two atomic steps
30 * 1. remove dentry (remember inode in journal)
31 * 2. unlink inode (clear journal)
32 *
33 * And again, on the next mount, if we were interrupted, we delete the inode.
34 * From a user's point of view the operation succeeded.
35 *
36 * Rename is the real pain to deal with, harder than all the other methods
37 * combined. Depending on the circumstances we can run into three cases.
38 * A "target rename" where the target dentry already existed, a "local
39 * rename" where both parent directories are identical or a "cross-directory
40 * rename" in the remaining case.
41 *
42 * Local rename is atomic, as the old dentry is simply rewritten with a new
43 * name.
44 *
45 * Cross-directory rename works in two steps, similar to __logfs_create and
46 * logfs_unlink:
47 * 1. Write new dentry (remember old dentry in journal)
48 * 2. Remove old dentry (clear journal)
49 *
50 * Here we remember a dentry instead of an inode. On next mount, if we were
51 * interrupted, we delete the dentry. From a user's point of view, the
52 * operation succeeded.
53 *
54 * Target rename works in three atomic steps:
55 * 1. Attach old inode to new dentry (remember old dentry and new inode)
56 * 2. Remove old dentry (still remember the new inode)
57 * 3. Remove victim inode
58 *
59 * Here we remember both an inode and a dentry. If we get interrupted
60 * between steps 1 and 2, we delete both the dentry and the inode. If
61 * we get interrupted between steps 2 and 3, we delete just the inode.
62 * In either case, the remaining objects are deleted on next mount. From
63 * a user's point of view, the operation succeeded.
64 */
65
66static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd,
67 loff_t pos)
68{
69 return logfs_inode_write(dir, dd, sizeof(*dd), pos, WF_LOCK, NULL);
70}
71
72static int write_inode(struct inode *inode)
73{
74 return __logfs_write_inode(inode, WF_LOCK);
75}
76
77static s64 dir_seek_data(struct inode *inode, s64 pos)
78{
79 s64 new_pos = logfs_seek_data(inode, pos);
80
81 return max(pos, new_pos - 1);
82}
83
84static int beyond_eof(struct inode *inode, loff_t bix)
85{
86 loff_t pos = bix << inode->i_sb->s_blocksize_bits;
87 return pos >= i_size_read(inode);
88}
89
90/*
91 * Prime value was chosen to be roughly 256 + 26. r5 hash uses 11,
92 * so short names (len <= 9) don't even occupy the complete 32bit name
93 * space. A prime >256 ensures short names quickly spread the 32bit
94 * name space. Add about 26 for the estimated amount of information
95 * of each character and pick a prime nearby, preferrably a bit-sparse
96 * one.
97 */
98static u32 hash_32(const char *s, int len, u32 seed)
99{
100 u32 hash = seed;
101 int i;
102
103 for (i = 0; i < len; i++)
104 hash = hash * 293 + s[i];
105 return hash;
106}
107
108/*
109 * We have to satisfy several conflicting requirements here. Small
110 * directories should stay fairly compact and not require too many
111 * indirect blocks. The number of possible locations for a given hash
112 * should be small to make lookup() fast. And we should try hard not
113 * to overflow the 32bit name space or nfs and 32bit host systems will
114 * be unhappy.
115 *
116 * So we use the following scheme. First we reduce the hash to 0..15
117 * and try a direct block. If that is occupied we reduce the hash to
118 * 16..255 and try an indirect block. Same for 2x and 3x indirect
119 * blocks. Lastly we reduce the hash to 0x800_0000 .. 0xffff_ffff,
120 * but use buckets containing eight entries instead of a single one.
121 *
122 * Using 16 entries should allow for a reasonable amount of hash
123 * collisions, so the 32bit name space can be packed fairly tight
124 * before overflowing. Oh and currently we don't overflow but return
125 * and error.
126 *
127 * How likely are collisions? Doing the appropriate math is beyond me
128 * and the Bronstein textbook. But running a test program to brute
129 * force collisions for a couple of days showed that on average the
130 * first collision occurs after 598M entries, with 290M being the
131 * smallest result. Obviously 21 entries could already cause a
132 * collision if all entries are carefully chosen.
133 */
134static pgoff_t hash_index(u32 hash, int round)
135{
136 u32 i0_blocks = I0_BLOCKS;
137 u32 i1_blocks = I1_BLOCKS;
138 u32 i2_blocks = I2_BLOCKS;
139 u32 i3_blocks = I3_BLOCKS;
140
141 switch (round) {
142 case 0:
143 return hash % i0_blocks;
144 case 1:
145 return i0_blocks + hash % (i1_blocks - i0_blocks);
146 case 2:
147 return i1_blocks + hash % (i2_blocks - i1_blocks);
148 case 3:
149 return i2_blocks + hash % (i3_blocks - i2_blocks);
150 case 4 ... 19:
151 return i3_blocks + 16 * (hash % (((1<<31) - i3_blocks) / 16))
152 + round - 4;
153 }
154 BUG();
155}
156
157static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry)
158{
159 struct qstr *name = &dentry->d_name;
160 struct page *page;
161 struct logfs_disk_dentry *dd;
162 u32 hash = hash_32(name->name, name->len, 0);
163 pgoff_t index;
164 int round;
165
166 if (name->len > LOGFS_MAX_NAMELEN)
167 return ERR_PTR(-ENAMETOOLONG);
168
169 for (round = 0; round < 20; round++) {
170 index = hash_index(hash, round);
171
172 if (beyond_eof(dir, index))
173 return NULL;
174 if (!logfs_exist_block(dir, index))
175 continue;
176 page = read_cache_page(dir->i_mapping, index,
177 (filler_t *)logfs_readpage, NULL);
178 if (IS_ERR(page))
179 return page;
180 dd = kmap_atomic(page, KM_USER0);
181 BUG_ON(dd->namelen == 0);
182
183 if (name->len != be16_to_cpu(dd->namelen) ||
184 memcmp(name->name, dd->name, name->len)) {
185 kunmap_atomic(dd, KM_USER0);
186 page_cache_release(page);
187 continue;
188 }
189
190 kunmap_atomic(dd, KM_USER0);
191 return page;
192 }
193 return NULL;
194}
195
196static int logfs_remove_inode(struct inode *inode)
197{
198 int ret;
199
200 inode->i_nlink--;
201 ret = write_inode(inode);
202 LOGFS_BUG_ON(ret, inode->i_sb);
203 return ret;
204}
205
206static void abort_transaction(struct inode *inode, struct logfs_transaction *ta)
207{
208 if (logfs_inode(inode)->li_block)
209 logfs_inode(inode)->li_block->ta = NULL;
210 kfree(ta);
211}
212
213static int logfs_unlink(struct inode *dir, struct dentry *dentry)
214{
215 struct logfs_super *super = logfs_super(dir->i_sb);
216 struct inode *inode = dentry->d_inode;
217 struct logfs_transaction *ta;
218 struct page *page;
219 pgoff_t index;
220 int ret;
221
222 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
223 if (!ta)
224 return -ENOMEM;
225
226 ta->state = UNLINK_1;
227 ta->ino = inode->i_ino;
228
229 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
230
231 page = logfs_get_dd_page(dir, dentry);
232 if (!page) {
233 kfree(ta);
234 return -ENOENT;
235 }
236 if (IS_ERR(page)) {
237 kfree(ta);
238 return PTR_ERR(page);
239 }
240 index = page->index;
241 page_cache_release(page);
242
243 mutex_lock(&super->s_dirop_mutex);
244 logfs_add_transaction(dir, ta);
245
246 ret = logfs_delete(dir, index, NULL);
247 if (!ret)
248 ret = write_inode(dir);
249
250 if (ret) {
251 abort_transaction(dir, ta);
252 printk(KERN_ERR"LOGFS: unable to delete inode\n");
253 goto out;
254 }
255
256 ta->state = UNLINK_2;
257 logfs_add_transaction(inode, ta);
258 ret = logfs_remove_inode(inode);
259out:
260 mutex_unlock(&super->s_dirop_mutex);
261 return ret;
262}
263
264static inline int logfs_empty_dir(struct inode *dir)
265{
266 u64 data;
267
268 data = logfs_seek_data(dir, 0) << dir->i_sb->s_blocksize_bits;
269 return data >= i_size_read(dir);
270}
271
272static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
273{
274 struct inode *inode = dentry->d_inode;
275
276 if (!logfs_empty_dir(inode))
277 return -ENOTEMPTY;
278
279 return logfs_unlink(dir, dentry);
280}
281
282/* FIXME: readdir currently has it's own dir_walk code. I don't see a good
283 * way to combine the two copies */
284#define IMPLICIT_NODES 2
285static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
286{
287 struct inode *dir = file->f_dentry->d_inode;
288 loff_t pos = file->f_pos - IMPLICIT_NODES;
289 struct page *page;
290 struct logfs_disk_dentry *dd;
291 int full;
292
293 BUG_ON(pos < 0);
294 for (;; pos++) {
295 if (beyond_eof(dir, pos))
296 break;
297 if (!logfs_exist_block(dir, pos)) {
298 /* deleted dentry */
299 pos = dir_seek_data(dir, pos);
300 continue;
301 }
302 page = read_cache_page(dir->i_mapping, pos,
303 (filler_t *)logfs_readpage, NULL);
304 if (IS_ERR(page))
305 return PTR_ERR(page);
306 dd = kmap(page);
307 BUG_ON(dd->namelen == 0);
308
309 full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
310 pos, be64_to_cpu(dd->ino), dd->type);
311 kunmap(page);
312 page_cache_release(page);
313 if (full)
314 break;
315 }
316
317 file->f_pos = pos + IMPLICIT_NODES;
318 return 0;
319}
320
321static int logfs_readdir(struct file *file, void *buf, filldir_t filldir)
322{
323 struct inode *inode = file->f_dentry->d_inode;
324 ino_t pino = parent_ino(file->f_dentry);
325 int err;
326
327 if (file->f_pos < 0)
328 return -EINVAL;
329
330 if (file->f_pos == 0) {
331 if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0)
332 return 0;
333 file->f_pos++;
334 }
335 if (file->f_pos == 1) {
336 if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0)
337 return 0;
338 file->f_pos++;
339 }
340
341 err = __logfs_readdir(file, buf, filldir);
342 return err;
343}
344
345static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name)
346{
347 dd->namelen = cpu_to_be16(name->len);
348 memcpy(dd->name, name->name, name->len);
349}
350
351static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry,
352 struct nameidata *nd)
353{
354 struct page *page;
355 struct logfs_disk_dentry *dd;
356 pgoff_t index;
357 u64 ino = 0;
358 struct inode *inode;
359
360 page = logfs_get_dd_page(dir, dentry);
361 if (IS_ERR(page))
362 return ERR_CAST(page);
363 if (!page) {
364 d_add(dentry, NULL);
365 return NULL;
366 }
367 index = page->index;
368 dd = kmap_atomic(page, KM_USER0);
369 ino = be64_to_cpu(dd->ino);
370 kunmap_atomic(dd, KM_USER0);
371 page_cache_release(page);
372
373 inode = logfs_iget(dir->i_sb, ino);
374 if (IS_ERR(inode)) {
375 printk(KERN_ERR"LogFS: Cannot read inode #%llx for dentry (%lx, %lx)n",
376 ino, dir->i_ino, index);
377 return ERR_CAST(inode);
378 }
379 return d_splice_alias(inode, dentry);
380}
381
382static void grow_dir(struct inode *dir, loff_t index)
383{
384 index = (index + 1) << dir->i_sb->s_blocksize_bits;
385 if (i_size_read(dir) < index)
386 i_size_write(dir, index);
387}
388
389static int logfs_write_dir(struct inode *dir, struct dentry *dentry,
390 struct inode *inode)
391{
392 struct page *page;
393 struct logfs_disk_dentry *dd;
394 u32 hash = hash_32(dentry->d_name.name, dentry->d_name.len, 0);
395 pgoff_t index;
396 int round, err;
397
398 for (round = 0; round < 20; round++) {
399 index = hash_index(hash, round);
400
401 if (logfs_exist_block(dir, index))
402 continue;
403 page = find_or_create_page(dir->i_mapping, index, GFP_KERNEL);
404 if (!page)
405 return -ENOMEM;
406
407 dd = kmap_atomic(page, KM_USER0);
408 memset(dd, 0, sizeof(*dd));
409 dd->ino = cpu_to_be64(inode->i_ino);
410 dd->type = logfs_type(inode);
411 logfs_set_name(dd, &dentry->d_name);
412 kunmap_atomic(dd, KM_USER0);
413
414 err = logfs_write_buf(dir, page, WF_LOCK);
415 unlock_page(page);
416 page_cache_release(page);
417 if (!err)
418 grow_dir(dir, index);
419 return err;
420 }
421 /* FIXME: Is there a better return value? In most cases neither
422 * the filesystem nor the directory are full. But we have had
423 * too many collisions for this particular hash and no fallback.
424 */
425 return -ENOSPC;
426}
427
428static int __logfs_create(struct inode *dir, struct dentry *dentry,
429 struct inode *inode, const char *dest, long destlen)
430{
431 struct logfs_super *super = logfs_super(dir->i_sb);
432 struct logfs_inode *li = logfs_inode(inode);
433 struct logfs_transaction *ta;
434 int ret;
435
436 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
437 if (!ta)
438 return -ENOMEM;
439
440 ta->state = CREATE_1;
441 ta->ino = inode->i_ino;
442 mutex_lock(&super->s_dirop_mutex);
443 logfs_add_transaction(inode, ta);
444
445 if (dest) {
446 /* symlink */
447 ret = logfs_inode_write(inode, dest, destlen, 0, WF_LOCK, NULL);
448 if (!ret)
449 ret = write_inode(inode);
450 } else {
451 /* creat/mkdir/mknod */
452 ret = write_inode(inode);
453 }
454 if (ret) {
455 abort_transaction(inode, ta);
456 li->li_flags |= LOGFS_IF_STILLBORN;
457 /* FIXME: truncate symlink */
458 inode->i_nlink--;
459 iput(inode);
460 goto out;
461 }
462
463 ta->state = CREATE_2;
464 logfs_add_transaction(dir, ta);
465 ret = logfs_write_dir(dir, dentry, inode);
466 /* sync directory */
467 if (!ret)
468 ret = write_inode(dir);
469
470 if (ret) {
471 logfs_del_transaction(dir, ta);
472 ta->state = CREATE_2;
473 logfs_add_transaction(inode, ta);
474 logfs_remove_inode(inode);
475 iput(inode);
476 goto out;
477 }
478 d_instantiate(dentry, inode);
479out:
480 mutex_unlock(&super->s_dirop_mutex);
481 return ret;
482}
483
484static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
485{
486 struct inode *inode;
487
488 /*
489 * FIXME: why do we have to fill in S_IFDIR, while the mode is
490 * correct for mknod, creat, etc.? Smells like the vfs *should*
491 * do it for us but for some reason fails to do so.
492 */
493 inode = logfs_new_inode(dir, S_IFDIR | mode);
494 if (IS_ERR(inode))
495 return PTR_ERR(inode);
496
497 inode->i_op = &logfs_dir_iops;
498 inode->i_fop = &logfs_dir_fops;
499
500 return __logfs_create(dir, dentry, inode, NULL, 0);
501}
502
503static int logfs_create(struct inode *dir, struct dentry *dentry, int mode,
504 struct nameidata *nd)
505{
506 struct inode *inode;
507
508 inode = logfs_new_inode(dir, mode);
509 if (IS_ERR(inode))
510 return PTR_ERR(inode);
511
512 inode->i_op = &logfs_reg_iops;
513 inode->i_fop = &logfs_reg_fops;
514 inode->i_mapping->a_ops = &logfs_reg_aops;
515
516 return __logfs_create(dir, dentry, inode, NULL, 0);
517}
518
519static int logfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
520 dev_t rdev)
521{
522 struct inode *inode;
523
524 if (dentry->d_name.len > LOGFS_MAX_NAMELEN)
525 return -ENAMETOOLONG;
526
527 inode = logfs_new_inode(dir, mode);
528 if (IS_ERR(inode))
529 return PTR_ERR(inode);
530
531 init_special_inode(inode, mode, rdev);
532
533 return __logfs_create(dir, dentry, inode, NULL, 0);
534}
535
536static int logfs_symlink(struct inode *dir, struct dentry *dentry,
537 const char *target)
538{
539 struct inode *inode;
540 size_t destlen = strlen(target) + 1;
541
542 if (destlen > dir->i_sb->s_blocksize)
543 return -ENAMETOOLONG;
544
545 inode = logfs_new_inode(dir, S_IFLNK | 0777);
546 if (IS_ERR(inode))
547 return PTR_ERR(inode);
548
549 inode->i_op = &logfs_symlink_iops;
550 inode->i_mapping->a_ops = &logfs_reg_aops;
551
552 return __logfs_create(dir, dentry, inode, target, destlen);
553}
554
555static int logfs_permission(struct inode *inode, int mask)
556{
557 return generic_permission(inode, mask, NULL);
558}
559
560static int logfs_link(struct dentry *old_dentry, struct inode *dir,
561 struct dentry *dentry)
562{
563 struct inode *inode = old_dentry->d_inode;
564
565 if (inode->i_nlink >= LOGFS_LINK_MAX)
566 return -EMLINK;
567
568 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
569 atomic_inc(&inode->i_count);
570 inode->i_nlink++;
571 mark_inode_dirty_sync(inode);
572
573 return __logfs_create(dir, dentry, inode, NULL, 0);
574}
575
576static int logfs_get_dd(struct inode *dir, struct dentry *dentry,
577 struct logfs_disk_dentry *dd, loff_t *pos)
578{
579 struct page *page;
580 void *map;
581
582 page = logfs_get_dd_page(dir, dentry);
583 if (IS_ERR(page))
584 return PTR_ERR(page);
585 *pos = page->index;
586 map = kmap_atomic(page, KM_USER0);
587 memcpy(dd, map, sizeof(*dd));
588 kunmap_atomic(map, KM_USER0);
589 page_cache_release(page);
590 return 0;
591}
592
593static int logfs_delete_dd(struct inode *dir, loff_t pos)
594{
595 /*
596 * Getting called with pos somewhere beyond eof is either a goofup
597 * within this file or means someone maliciously edited the
598 * (crc-protected) journal.
599 */
600 BUG_ON(beyond_eof(dir, pos));
601 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
602 log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos);
603 return logfs_delete(dir, pos, NULL);
604}
605
606/*
607 * Cross-directory rename, target does not exist. Just a little nasty.
608 * Create a new dentry in the target dir, then remove the old dentry,
609 * all the while taking care to remember our operation in the journal.
610 */
611static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
612 struct inode *new_dir, struct dentry *new_dentry)
613{
614 struct logfs_super *super = logfs_super(old_dir->i_sb);
615 struct logfs_disk_dentry dd;
616 struct logfs_transaction *ta;
617 loff_t pos;
618 int err;
619
620 /* 1. locate source dd */
621 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
622 if (err)
623 return err;
624
625 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
626 if (!ta)
627 return -ENOMEM;
628
629 ta->state = CROSS_RENAME_1;
630 ta->dir = old_dir->i_ino;
631 ta->pos = pos;
632
633 /* 2. write target dd */
634 mutex_lock(&super->s_dirop_mutex);
635 logfs_add_transaction(new_dir, ta);
636 err = logfs_write_dir(new_dir, new_dentry, old_dentry->d_inode);
637 if (!err)
638 err = write_inode(new_dir);
639
640 if (err) {
641 super->s_rename_dir = 0;
642 super->s_rename_pos = 0;
643 abort_transaction(new_dir, ta);
644 goto out;
645 }
646
647 /* 3. remove source dd */
648 ta->state = CROSS_RENAME_2;
649 logfs_add_transaction(old_dir, ta);
650 err = logfs_delete_dd(old_dir, pos);
651 if (!err)
652 err = write_inode(old_dir);
653 LOGFS_BUG_ON(err, old_dir->i_sb);
654out:
655 mutex_unlock(&super->s_dirop_mutex);
656 return err;
657}
658
659static int logfs_replace_inode(struct inode *dir, struct dentry *dentry,
660 struct logfs_disk_dentry *dd, struct inode *inode)
661{
662 loff_t pos;
663 int err;
664
665 err = logfs_get_dd(dir, dentry, dd, &pos);
666 if (err)
667 return err;
668 dd->ino = cpu_to_be64(inode->i_ino);
669 dd->type = logfs_type(inode);
670
671 err = write_dir(dir, dd, pos);
672 if (err)
673 return err;
674 log_dir("Replace dentry (%lx, %llx) %s -> %llx\n", dir->i_ino, pos,
675 dd->name, be64_to_cpu(dd->ino));
676 return write_inode(dir);
677}
678
679/* Target dentry exists - the worst case. We need to attach the source
680 * inode to the target dentry, then remove the orphaned target inode and
681 * source dentry.
682 */
683static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry,
684 struct inode *new_dir, struct dentry *new_dentry)
685{
686 struct logfs_super *super = logfs_super(old_dir->i_sb);
687 struct inode *old_inode = old_dentry->d_inode;
688 struct inode *new_inode = new_dentry->d_inode;
689 int isdir = S_ISDIR(old_inode->i_mode);
690 struct logfs_disk_dentry dd;
691 struct logfs_transaction *ta;
692 loff_t pos;
693 int err;
694
695 BUG_ON(isdir != S_ISDIR(new_inode->i_mode));
696 if (isdir) {
697 if (!logfs_empty_dir(new_inode))
698 return -ENOTEMPTY;
699 }
700
701 /* 1. locate source dd */
702 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
703 if (err)
704 return err;
705
706 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
707 if (!ta)
708 return -ENOMEM;
709
710 ta->state = TARGET_RENAME_1;
711 ta->dir = old_dir->i_ino;
712 ta->pos = pos;
713 ta->ino = new_inode->i_ino;
714
715 /* 2. attach source inode to target dd */
716 mutex_lock(&super->s_dirop_mutex);
717 logfs_add_transaction(new_dir, ta);
718 err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode);
719 if (err) {
720 super->s_rename_dir = 0;
721 super->s_rename_pos = 0;
722 super->s_victim_ino = 0;
723 abort_transaction(new_dir, ta);
724 goto out;
725 }
726
727 /* 3. remove source dd */
728 ta->state = TARGET_RENAME_2;
729 logfs_add_transaction(old_dir, ta);
730 err = logfs_delete_dd(old_dir, pos);
731 if (!err)
732 err = write_inode(old_dir);
733 LOGFS_BUG_ON(err, old_dir->i_sb);
734
735 /* 4. remove target inode */
736 ta->state = TARGET_RENAME_3;
737 logfs_add_transaction(new_inode, ta);
738 err = logfs_remove_inode(new_inode);
739
740out:
741 mutex_unlock(&super->s_dirop_mutex);
742 return err;
743}
744
745static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry,
746 struct inode *new_dir, struct dentry *new_dentry)
747{
748 if (new_dentry->d_inode)
749 return logfs_rename_target(old_dir, old_dentry,
750 new_dir, new_dentry);
751 return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry);
752}
753
754/* No locking done here, as this is called before .get_sb() returns. */
755int logfs_replay_journal(struct super_block *sb)
756{
757 struct logfs_super *super = logfs_super(sb);
758 struct inode *inode;
759 u64 ino, pos;
760 int err;
761
762 if (super->s_victim_ino) {
763 /* delete victim inode */
764 ino = super->s_victim_ino;
765 printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino);
766 inode = logfs_iget(sb, ino);
767 if (IS_ERR(inode))
768 goto fail;
769
770 LOGFS_BUG_ON(i_size_read(inode) > 0, sb);
771 super->s_victim_ino = 0;
772 err = logfs_remove_inode(inode);
773 iput(inode);
774 if (err) {
775 super->s_victim_ino = ino;
776 goto fail;
777 }
778 }
779 if (super->s_rename_dir) {
780 /* delete old dd from rename */
781 ino = super->s_rename_dir;
782 pos = super->s_rename_pos;
783 printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n",
784 ino, pos);
785 inode = logfs_iget(sb, ino);
786 if (IS_ERR(inode))
787 goto fail;
788
789 super->s_rename_dir = 0;
790 super->s_rename_pos = 0;
791 err = logfs_delete_dd(inode, pos);
792 iput(inode);
793 if (err) {
794 super->s_rename_dir = ino;
795 super->s_rename_pos = pos;
796 goto fail;
797 }
798 }
799 return 0;
800fail:
801 LOGFS_BUG(sb);
802 return -EIO;
803}
804
805const struct inode_operations logfs_symlink_iops = {
806 .readlink = generic_readlink,
807 .follow_link = page_follow_link_light,
808};
809
810const struct inode_operations logfs_dir_iops = {
811 .create = logfs_create,
812 .link = logfs_link,
813 .lookup = logfs_lookup,
814 .mkdir = logfs_mkdir,
815 .mknod = logfs_mknod,
816 .rename = logfs_rename,
817 .rmdir = logfs_rmdir,
818 .permission = logfs_permission,
819 .symlink = logfs_symlink,
820 .unlink = logfs_unlink,
821};
822const struct file_operations logfs_dir_fops = {
823 .fsync = logfs_fsync,
824 .ioctl = logfs_ioctl,
825 .readdir = logfs_readdir,
826 .read = generic_read_dir,
827};
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
new file mode 100644
index 000000000000..370f367a933e
--- /dev/null
+++ b/fs/logfs/file.c
@@ -0,0 +1,263 @@
1/*
2 * fs/logfs/file.c - prepare_write, commit_write and friends
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/sched.h>
10#include <linux/writeback.h>
11
12static int logfs_write_begin(struct file *file, struct address_space *mapping,
13 loff_t pos, unsigned len, unsigned flags,
14 struct page **pagep, void **fsdata)
15{
16 struct inode *inode = mapping->host;
17 struct page *page;
18 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
19
20 page = grab_cache_page_write_begin(mapping, index, flags);
21 if (!page)
22 return -ENOMEM;
23 *pagep = page;
24
25 if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
26 return 0;
27 if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
28 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
29 unsigned end = start + len;
30
31 /* Reading beyond i_size is simple: memset to zero */
32 zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
33 return 0;
34 }
35 return logfs_readpage_nolock(page);
36}
37
38static int logfs_write_end(struct file *file, struct address_space *mapping,
39 loff_t pos, unsigned len, unsigned copied, struct page *page,
40 void *fsdata)
41{
42 struct inode *inode = mapping->host;
43 pgoff_t index = page->index;
44 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
45 unsigned end = start + copied;
46 int ret = 0;
47
48 BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize);
49 BUG_ON(page->index > I3_BLOCKS);
50
51 if (copied < len) {
52 /*
53 * Short write of a non-initialized paged. Just tell userspace
54 * to retry the entire page.
55 */
56 if (!PageUptodate(page)) {
57 copied = 0;
58 goto out;
59 }
60 }
61 if (copied == 0)
62 goto out; /* FIXME: do we need to update inode? */
63
64 if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) {
65 i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end);
66 mark_inode_dirty_sync(inode);
67 }
68
69 SetPageUptodate(page);
70 if (!PageDirty(page)) {
71 if (!get_page_reserve(inode, page))
72 __set_page_dirty_nobuffers(page);
73 else
74 ret = logfs_write_buf(inode, page, WF_LOCK);
75 }
76out:
77 unlock_page(page);
78 page_cache_release(page);
79 return ret ? ret : copied;
80}
81
/*
 * ->readpage: fill @page via logfs_readpage_nolock() and unlock it afterwards,
 * whether or not the read succeeded.  Returns the result of the read.
 */
int logfs_readpage(struct file *file, struct page *page)
{
	int err = logfs_readpage_nolock(page);

	unlock_page(page);
	return err;
}
90
91/* Clear the page's dirty flag in the radix tree. */
92/* TODO: mucking with PageWriteback is silly. Add a generic function to clear
93 * the dirty bit from the radix tree for filesystems that don't have to wait
94 * for page writeback to finish (i.e. any compressing filesystem).
95 */
96static void clear_radix_tree_dirty(struct page *page)
97{
98 BUG_ON(PagePrivate(page) || page->private);
99 set_page_writeback(page);
100 end_page_writeback(page);
101}
102
103static int __logfs_writepage(struct page *page)
104{
105 struct inode *inode = page->mapping->host;
106 int err;
107
108 err = logfs_write_buf(inode, page, WF_LOCK);
109 if (err)
110 set_page_dirty(page);
111 else
112 clear_radix_tree_dirty(page);
113 unlock_page(page);
114 return err;
115}
116
117static int logfs_writepage(struct page *page, struct writeback_control *wbc)
118{
119 struct inode *inode = page->mapping->host;
120 loff_t i_size = i_size_read(inode);
121 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
122 unsigned offset;
123 u64 bix;
124 level_t level;
125
126 log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index,
127 page);
128
129 logfs_unpack_index(page->index, &bix, &level);
130
131 /* Indirect blocks are never truncated */
132 if (level != 0)
133 return __logfs_writepage(page);
134
135 /*
136 * TODO: everything below is a near-verbatim copy of nobh_writepage().
137 * The relevant bits should be factored out after logfs is merged.
138 */
139
140 /* Is the page fully inside i_size? */
141 if (bix < end_index)
142 return __logfs_writepage(page);
143
144 /* Is the page fully outside i_size? (truncate in progress) */
145 offset = i_size & (PAGE_CACHE_SIZE-1);
146 if (bix > end_index || offset == 0) {
147 unlock_page(page);
148 return 0; /* don't care */
149 }
150
151 /*
152 * The page straddles i_size. It must be zeroed out on each and every
153 * writepage invokation because it may be mmapped. "A file is mapped
154 * in multiples of the page size. For a file that is not a multiple of
155 * the page size, the remaining memory is zeroed when mapped, and
156 * writes to that region are not written out to the file."
157 */
158 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
159 return __logfs_writepage(page);
160}
161
162static void logfs_invalidatepage(struct page *page, unsigned long offset)
163{
164 move_page_to_btree(page);
165 BUG_ON(PagePrivate(page) || page->private);
166}
167
168static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)
169{
170 return 0; /* None of these are easy to release */
171}
172
173
174int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
175 unsigned long arg)
176{
177 struct logfs_inode *li = logfs_inode(inode);
178 unsigned int oldflags, flags;
179 int err;
180
181 switch (cmd) {
182 case FS_IOC_GETFLAGS:
183 flags = li->li_flags & LOGFS_FL_USER_VISIBLE;
184 return put_user(flags, (int __user *)arg);
185 case FS_IOC_SETFLAGS:
186 if (IS_RDONLY(inode))
187 return -EROFS;
188
189 if (!is_owner_or_cap(inode))
190 return -EACCES;
191
192 err = get_user(flags, (int __user *)arg);
193 if (err)
194 return err;
195
196 mutex_lock(&inode->i_mutex);
197 oldflags = li->li_flags;
198 flags &= LOGFS_FL_USER_MODIFIABLE;
199 flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE;
200 li->li_flags = flags;
201 mutex_unlock(&inode->i_mutex);
202
203 inode->i_ctime = CURRENT_TIME;
204 mark_inode_dirty_sync(inode);
205 return 0;
206
207 default:
208 return -ENOTTY;
209 }
210}
211
212int logfs_fsync(struct file *file, struct dentry *dentry, int datasync)
213{
214 struct super_block *sb = dentry->d_inode->i_sb;
215 struct logfs_super *super = logfs_super(sb);
216
217 /* FIXME: write anchor */
218 super->s_devops->sync(sb);
219 return 0;
220}
221
222static int logfs_setattr(struct dentry *dentry, struct iattr *attr)
223{
224 struct inode *inode = dentry->d_inode;
225 int err = 0;
226
227 if (attr->ia_valid & ATTR_SIZE)
228 err = logfs_truncate(inode, attr->ia_size);
229 attr->ia_valid &= ~ATTR_SIZE;
230
231 if (!err)
232 err = inode_change_ok(inode, attr);
233 if (!err)
234 err = inode_setattr(inode, attr);
235 return err;
236}
237
238const struct inode_operations logfs_reg_iops = {
239 .setattr = logfs_setattr,
240};
241
242const struct file_operations logfs_reg_fops = {
243 .aio_read = generic_file_aio_read,
244 .aio_write = generic_file_aio_write,
245 .fsync = logfs_fsync,
246 .ioctl = logfs_ioctl,
247 .llseek = generic_file_llseek,
248 .mmap = generic_file_readonly_mmap,
249 .open = generic_file_open,
250 .read = do_sync_read,
251 .write = do_sync_write,
252};
253
254const struct address_space_operations logfs_reg_aops = {
255 .invalidatepage = logfs_invalidatepage,
256 .readpage = logfs_readpage,
257 .releasepage = logfs_releasepage,
258 .set_page_dirty = __set_page_dirty_nobuffers,
259 .writepage = logfs_writepage,
260 .writepages = generic_writepages,
261 .write_begin = logfs_write_begin,
262 .write_end = logfs_write_end,
263};
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
new file mode 100644
index 000000000000..76c242fbe1b0
--- /dev/null
+++ b/fs/logfs/gc.c
@@ -0,0 +1,739 @@
1/*
2 * fs/logfs/gc.c - garbage collection code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/sched.h>
10#include <linux/slab.h>
11
/*
 * Wear leveling needs to kick in when the difference between low erase
 * counts and high erase counts gets too big.  A good value for "too big"
 * may be somewhat below 10% of maximum erase count for the device.
 * Why not 397, to pick a nice round number with no specific meaning? :)
 *
 * WL_RATELIMIT is the minimum time between two wear level events.  A huge
 * number of segments may fulfil the requirements for wear leveling at the
 * same time.  If that happens we don't want to cause a latency from hell,
 * but just gently pick one segment every so often and minimize overhead.
 * It is added to the superblock's s_gec counter in wl_ratelimit() below.
 *
 * MAX_OBJ_ALIASES is the threshold __logfs_gc_pass() below compares the
 * shadowed-segment and object-alias counters against.
 */
#define WL_DELTA 397
#define WL_RATELIMIT 100
#define MAX_OBJ_ALIASES 2600
#define SCAN_RATIO 512	/* number of scanned segments per gc'd segment */
#define LIST_SIZE 64	/* base size of candidate lists */
#define SCAN_ROUNDS 128	/* maximum number of complete medium scans */
#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */
30
31static int no_free_segments(struct super_block *sb)
32{
33 struct logfs_super *super = logfs_super(sb);
34
35 return super->s_free_list.count;
36}
37
38/* journal has distance -1, top-most ifile layer distance 0 */
39static u8 root_distance(struct super_block *sb, gc_level_t __gc_level)
40{
41 struct logfs_super *super = logfs_super(sb);
42 u8 gc_level = (__force u8)__gc_level;
43
44 switch (gc_level) {
45 case 0: /* fall through */
46 case 1: /* fall through */
47 case 2: /* fall through */
48 case 3:
49 /* file data or indirect blocks */
50 return super->s_ifile_levels + super->s_iblock_levels - gc_level;
51 case 6: /* fall through */
52 case 7: /* fall through */
53 case 8: /* fall through */
54 case 9:
55 /* inode file data or indirect blocks */
56 return super->s_ifile_levels - (gc_level - 6);
57 default:
58 printk(KERN_ERR"LOGFS: segment of unknown level %x found\n",
59 gc_level);
60 WARN_ON(1);
61 return super->s_ifile_levels + super->s_iblock_levels;
62 }
63}
64
65static int segment_is_reserved(struct super_block *sb, u32 segno)
66{
67 struct logfs_super *super = logfs_super(sb);
68 struct logfs_area *area;
69 void *reserved;
70 int i;
71
72 /* Some segments are reserved. Just pretend they were all valid */
73 reserved = btree_lookup32(&super->s_reserved_segments, segno);
74 if (reserved)
75 return 1;
76
77 /* Currently open segments */
78 for_each_area(i) {
79 area = super->s_area[i];
80 if (area->a_is_open && area->a_segno == segno)
81 return 1;
82 }
83
84 return 0;
85}
86
87static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
88{
89 BUG();
90}
91
92/*
93 * Returns the bytes consumed by valid objects in this segment. Object headers
94 * are counted, the segment header is not.
95 */
96static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
97 gc_level_t *gc_level)
98{
99 struct logfs_segment_entry se;
100 u32 ec_level;
101
102 logfs_get_segment_entry(sb, segno, &se);
103 if (se.ec_level == cpu_to_be32(BADSEG) ||
104 se.valid == cpu_to_be32(RESERVED))
105 return RESERVED;
106
107 ec_level = be32_to_cpu(se.ec_level);
108 *ec = ec_level >> 4;
109 *gc_level = GC_LEVEL(ec_level & 0xf);
110 return be32_to_cpu(se.valid);
111}
112
113static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
114 u64 bix, gc_level_t gc_level)
115{
116 struct inode *inode;
117 int err, cookie;
118
119 inode = logfs_safe_iget(sb, ino, &cookie);
120 err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0);
121 BUG_ON(err);
122 logfs_safe_iput(inode, cookie);
123}
124
125static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist)
126{
127 struct logfs_super *super = logfs_super(sb);
128 struct logfs_segment_header sh;
129 struct logfs_object_header oh;
130 u64 ofs, ino, bix;
131 u32 seg_ofs, logical_segno, cleaned = 0;
132 int err, len, valid;
133 gc_level_t gc_level;
134
135 LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);
136
137 btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS);
138 err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
139 BUG_ON(err);
140 gc_level = GC_LEVEL(sh.level);
141 logical_segno = be32_to_cpu(sh.segno);
142 if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
143 logfs_mark_segment_bad(sb, segno);
144 cleaned = -1;
145 goto out;
146 }
147
148 for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
149 seg_ofs + sizeof(oh) < super->s_segsize; ) {
150 ofs = dev_ofs(sb, logical_segno, seg_ofs);
151 err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh),
152 &oh);
153 BUG_ON(err);
154
155 if (!memchr_inv(&oh, 0xff, sizeof(oh)))
156 break;
157
158 if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
159 logfs_mark_segment_bad(sb, segno);
160 cleaned = super->s_segsize - 1;
161 goto out;
162 }
163
164 ino = be64_to_cpu(oh.ino);
165 bix = be64_to_cpu(oh.bix);
166 len = sizeof(oh) + be16_to_cpu(oh.len);
167 valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level);
168 if (valid == 1) {
169 logfs_cleanse_block(sb, ofs, ino, bix, gc_level);
170 cleaned += len;
171 } else if (valid == 2) {
172 /* Will be invalid upon journal commit */
173 cleaned += len;
174 }
175 seg_ofs += len;
176 }
177out:
178 btree_remove32(&super->s_reserved_segments, segno);
179 return cleaned;
180}
181
182static struct gc_candidate *add_list(struct gc_candidate *cand,
183 struct candidate_list *list)
184{
185 struct rb_node **p = &list->rb_tree.rb_node;
186 struct rb_node *parent = NULL;
187 struct gc_candidate *cur;
188 int comp;
189
190 cand->list = list;
191 while (*p) {
192 parent = *p;
193 cur = rb_entry(parent, struct gc_candidate, rb_node);
194
195 if (list->sort_by_ec)
196 comp = cand->erase_count < cur->erase_count;
197 else
198 comp = cand->valid < cur->valid;
199
200 if (comp)
201 p = &parent->rb_left;
202 else
203 p = &parent->rb_right;
204 }
205 rb_link_node(&cand->rb_node, parent, p);
206 rb_insert_color(&cand->rb_node, &list->rb_tree);
207
208 if (list->count <= list->maxcount) {
209 list->count++;
210 return NULL;
211 }
212 cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node);
213 rb_erase(&cand->rb_node, &list->rb_tree);
214 cand->list = NULL;
215 return cand;
216}
217
218static void remove_from_list(struct gc_candidate *cand)
219{
220 struct candidate_list *list = cand->list;
221
222 rb_erase(&cand->rb_node, &list->rb_tree);
223 list->count--;
224}
225
226static void free_candidate(struct super_block *sb, struct gc_candidate *cand)
227{
228 struct logfs_super *super = logfs_super(sb);
229
230 btree_remove32(&super->s_cand_tree, cand->segno);
231 kfree(cand);
232}
233
234u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec)
235{
236 struct gc_candidate *cand;
237 u32 segno;
238
239 BUG_ON(list->count == 0);
240
241 cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
242 remove_from_list(cand);
243 segno = cand->segno;
244 if (ec)
245 *ec = cand->erase_count;
246 free_candidate(sb, cand);
247 return segno;
248}
249
250/*
251 * We have several lists to manage segments with. The reserve_list is used to
252 * deal with bad blocks. We try to keep the best (lowest ec) segments on this
253 * list.
254 * The free_list contains free segments for normal usage. It usually gets the
255 * second pick after the reserve_list. But when the free_list is running short
256 * it is more important to keep the free_list full than to keep a reserve.
257 *
258 * Segments that are not free are put onto a per-level low_list. If we have
259 * to run garbage collection, we pick a candidate from there. All segments on
260 * those lists should have at least some free space so GC will make progress.
261 *
262 * And last we have the ec_list, which is used to pick segments for wear
263 * leveling.
264 *
265 * If all appropriate lists are full, we simply free the candidate and forget
266 * about that segment for a while. We have better candidates for each purpose.
267 */
268static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
269{
270 struct logfs_super *super = logfs_super(sb);
271 u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;
272
273 if (cand->valid == 0) {
274 /* 100% free segments */
275 log_gc_noisy("add reserve segment %x (ec %x) at %llx\n",
276 cand->segno, cand->erase_count,
277 dev_ofs(sb, cand->segno, 0));
278 cand = add_list(cand, &super->s_reserve_list);
279 if (cand) {
280 log_gc_noisy("add free segment %x (ec %x) at %llx\n",
281 cand->segno, cand->erase_count,
282 dev_ofs(sb, cand->segno, 0));
283 cand = add_list(cand, &super->s_free_list);
284 }
285 } else {
286 /* good candidates for Garbage Collection */
287 if (cand->valid < full)
288 cand = add_list(cand, &super->s_low_list[cand->dist]);
289 /* good candidates for wear leveling,
290 * segments that were recently written get ignored */
291 if (cand)
292 cand = add_list(cand, &super->s_ec_list);
293 }
294 if (cand)
295 free_candidate(sb, cand);
296}
297
298static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
299 u8 dist)
300{
301 struct logfs_super *super = logfs_super(sb);
302 struct gc_candidate *cand;
303
304 cand = kmalloc(sizeof(*cand), GFP_NOFS);
305 if (!cand)
306 return -ENOMEM;
307
308 cand->segno = segno;
309 cand->valid = valid;
310 cand->erase_count = ec;
311 cand->dist = dist;
312
313 btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS);
314 __add_candidate(sb, cand);
315 return 0;
316}
317
318static void remove_segment_from_lists(struct super_block *sb, u32 segno)
319{
320 struct logfs_super *super = logfs_super(sb);
321 struct gc_candidate *cand;
322
323 cand = btree_lookup32(&super->s_cand_tree, segno);
324 if (cand) {
325 remove_from_list(cand);
326 free_candidate(sb, cand);
327 }
328}
329
330static void scan_segment(struct super_block *sb, u32 segno)
331{
332 u32 valid, ec = 0;
333 gc_level_t gc_level = 0;
334 u8 dist;
335
336 if (segment_is_reserved(sb, segno))
337 return;
338
339 remove_segment_from_lists(sb, segno);
340 valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
341 if (valid == RESERVED)
342 return;
343
344 dist = root_distance(sb, gc_level);
345 add_candidate(sb, segno, valid, ec, dist);
346}
347
348static struct gc_candidate *first_in_list(struct candidate_list *list)
349{
350 if (list->count == 0)
351 return NULL;
352 return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
353}
354
355/*
356 * Find the best segment for garbage collection. Main criterion is
357 * the segment requiring the least effort to clean. Secondary
358 * criterion is to GC on the lowest level available.
359 *
360 * So we search the least effort segment on the lowest level first,
361 * then move up and pick another segment iff is requires significantly
362 * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
363 */
364static struct gc_candidate *get_candidate(struct super_block *sb)
365{
366 struct logfs_super *super = logfs_super(sb);
367 int i, max_dist;
368 struct gc_candidate *cand = NULL, *this;
369
370 max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS);
371
372 for (i = max_dist; i >= 0; i--) {
373 this = first_in_list(&super->s_low_list[i]);
374 if (!this)
375 continue;
376 if (!cand)
377 cand = this;
378 if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
379 cand = this;
380 }
381 return cand;
382}
383
384static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
385{
386 struct logfs_super *super = logfs_super(sb);
387 gc_level_t gc_level;
388 u32 cleaned, valid, segno, ec;
389 u8 dist;
390
391 if (!cand) {
392 log_gc("GC attempted, but no candidate found\n");
393 return 0;
394 }
395
396 segno = cand->segno;
397 dist = cand->dist;
398 valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
399 free_candidate(sb, cand);
400 log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
401 segno, (u64)segno << super->s_segshift,
402 dist, no_free_segments(sb), valid,
403 super->s_free_bytes);
404 cleaned = logfs_gc_segment(sb, segno, dist);
405 log_gc("GC segment #%02x complete - now %x valid\n", segno,
406 valid - cleaned);
407 BUG_ON(cleaned != valid);
408 return 1;
409}
410
/*
 * Pick the best GC candidate, unlink it from its list and collect it.
 * Returns 1 if a segment was collected, 0 if no candidate was available.
 */
static int logfs_gc_once(struct super_block *sb)
{
	struct gc_candidate *cand = get_candidate(sb);

	if (cand)
		remove_from_list(cand);
	return __logfs_gc_once(sb, cand);
}
420
421/* returns 1 if a wrap occurs, 0 otherwise */
422static int logfs_scan_some(struct super_block *sb)
423{
424 struct logfs_super *super = logfs_super(sb);
425 u32 segno;
426 int i, ret = 0;
427
428 segno = super->s_sweeper;
429 for (i = SCAN_RATIO; i > 0; i--) {
430 segno++;
431 if (segno >= super->s_no_segs) {
432 segno = 0;
433 ret = 1;
434 /* Break out of the loop. We want to read a single
435 * block from the segment size on next invocation if
436 * SCAN_RATIO is set to match block size
437 */
438 break;
439 }
440
441 scan_segment(sb, segno);
442 }
443 super->s_sweeper = segno;
444 return ret;
445}
446
/*
 * Run garbage collection until at least @target free segments are available
 * and the number of object aliases is below MAX_OBJ_ALIASES.
 *
 * In principle, this function should loop forever, looking for GC candidates
 * and moving data.  LogFS is designed in such a way that this loop is
 * guaranteed to terminate.
 *
 * Limiting the loop to some iterations serves purely to catch cases when
 * these guarantees have failed.  An actual endless loop is an obvious bug
 * and should be reported as such.
 *
 * The function returns normally only from the early exit or from the
 * write_alias section; leaving the loop any other way (SCAN_ROUNDS wraps of
 * the medium, or more than two wraps without GC progress) ends in
 * LOGFS_BUG().
 */
static void __logfs_gc_pass(struct super_block *sb, int target)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_block *block;
	/* round counts medium wraps; last_progress is the round of the
	 * most recent successful logfs_gc_once() */
	int round, progress, last_progress = 0;

	/*
	 * Doing too many changes to the segfile at once would result
	 * in a large number of aliases.  Write the journal before
	 * things get out of hand.
	 */
	if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES)
		logfs_write_anchor(sb);

	/* Nothing to do if both goals are already met */
	if (no_free_segments(sb) >= target &&
			super->s_no_object_aliases < MAX_OBJ_ALIASES)
		return;

	log_gc("__logfs_gc_pass(%x)\n", target);
	for (round = 0; round < SCAN_ROUNDS; ) {
		if (no_free_segments(sb) >= target)
			goto write_alias;

		/* Sync in-memory state with on-medium state in case they
		 * diverged */
		logfs_write_anchor(sb);
		/* logfs_scan_some() returns 1 when the sweeper wraps */
		round += logfs_scan_some(sb);
		if (no_free_segments(sb) >= target)
			goto write_alias;
		progress = logfs_gc_once(sb);
		if (progress)
			last_progress = round;
		else if (round - last_progress > 2)
			break;
		continue;

		/*
		 * The goto logic is nasty, I just don't know a better way to
		 * code it.  GC is supposed to ensure two things:
		 * 1. Enough free segments are available.
		 * 2. The number of aliases is bounded.
		 * When 1. is achieved, we take a look at 2. and write back
		 * some alias-containing blocks, if necessary.  However, after
		 * each such write we need to go back to 1., as writes can
		 * consume free segments.
		 */
write_alias:
		if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
			return;
		if (list_empty(&super->s_object_alias)) {
			/* All aliases are still in btree */
			return;
		}
		log_gc("Write back one alias\n");
		block = list_entry(super->s_object_alias.next,
				struct logfs_block, alias_list);
		block->ops->write_block(block);
		/*
		 * To round off the nasty goto logic, we reset round here.  It
		 * is a safety-net for GC not making any progress and limited
		 * to something reasonably small.  If we incremented it for
		 * every single alias, the loop could terminate rather quickly.
		 */
		round = 0;
	}
	LOGFS_BUG(sb);
}
523
524static int wl_ratelimit(struct super_block *sb, u64 *next_event)
525{
526 struct logfs_super *super = logfs_super(sb);
527
528 if (*next_event < super->s_gec) {
529 *next_event = super->s_gec + WL_RATELIMIT;
530 return 0;
531 }
532 return 1;
533}
534
535static void logfs_wl_pass(struct super_block *sb)
536{
537 struct logfs_super *super = logfs_super(sb);
538 struct gc_candidate *wl_cand, *free_cand;
539
540 if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
541 return;
542
543 wl_cand = first_in_list(&super->s_ec_list);
544 if (!wl_cand)
545 return;
546 free_cand = first_in_list(&super->s_free_list);
547 if (!free_cand)
548 return;
549
550 if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
551 remove_from_list(wl_cand);
552 __logfs_gc_once(sb, wl_cand);
553 }
554}
555
556/*
557 * The journal needs wear leveling as well. But moving the journal is an
558 * expensive operation so we try to avoid it as much as possible. And if we
559 * have to do it, we move the whole journal, not individual segments.
560 *
561 * Ratelimiting is not strictly necessary here, it mainly serves to avoid the
562 * calculations. First we check whether moving the journal would be a
563 * significant improvement. That means that a) the current journal segments
564 * have more wear than the future journal segments and b) the current journal
565 * segments have more wear than normal ostore segments.
566 * Rationale for b) is that we don't have to move the journal if it is aging
567 * less than the ostore, even if the reserve segments age even less (they are
568 * excluded from wear leveling, after all).
569 * Next we check that the superblocks have less wear than the journal. Since
570 * moving the journal requires writing the superblocks, we have to protect the
571 * superblocks even more than the journal.
572 *
573 * Also we double the acceptable wear difference, compared to ostore wear
574 * leveling. Journal data is read and rewritten rapidly, comparatively. So
575 * soft errors have much less time to accumulate and we allow the journal to
576 * be a bit worse than the ostore.
577 */
578static void logfs_journal_wl_pass(struct super_block *sb)
579{
580 struct logfs_super *super = logfs_super(sb);
581 struct gc_candidate *cand;
582 u32 min_journal_ec = -1, max_reserve_ec = 0;
583 int i;
584
585 if (wl_ratelimit(sb, &super->s_wl_gec_journal))
586 return;
587
588 if (super->s_reserve_list.count < super->s_no_journal_segs) {
589 /* Reserve is not full enough to move complete journal */
590 return;
591 }
592
593 journal_for_each(i)
594 if (super->s_journal_seg[i])
595 min_journal_ec = min(min_journal_ec,
596 super->s_journal_ec[i]);
597 cand = rb_entry(rb_first(&super->s_free_list.rb_tree),
598 struct gc_candidate, rb_node);
599 max_reserve_ec = cand->erase_count;
600 for (i = 0; i < 2; i++) {
601 struct logfs_segment_entry se;
602 u32 segno = seg_no(sb, super->s_sb_ofs[i]);
603 u32 ec;
604
605 logfs_get_segment_entry(sb, segno, &se);
606 ec = be32_to_cpu(se.ec_level) >> 4;
607 max_reserve_ec = max(max_reserve_ec, ec);
608 }
609
610 if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) {
611 do_logfs_journal_wl_pass(sb);
612 }
613}
614
615void logfs_gc_pass(struct super_block *sb)
616{
617 struct logfs_super *super = logfs_super(sb);
618
619 //BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex));
620 /* Write journal before free space is getting saturated with dirty
621 * objects.
622 */
623 if (super->s_dirty_used_bytes + super->s_dirty_free_bytes
624 + LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes)
625 logfs_write_anchor(sb);
626 __logfs_gc_pass(sb, super->s_total_levels);
627 logfs_wl_pass(sb);
628 logfs_journal_wl_pass(sb);
629}
630
631static int check_area(struct super_block *sb, int i)
632{
633 struct logfs_super *super = logfs_super(sb);
634 struct logfs_area *area = super->s_area[i];
635 struct logfs_object_header oh;
636 u32 segno = area->a_segno;
637 u32 ofs = area->a_used_bytes;
638 __be32 crc;
639 int err;
640
641 if (!area->a_is_open)
642 return 0;
643
644 for (ofs = area->a_used_bytes;
645 ofs <= super->s_segsize - sizeof(oh);
646 ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) {
647 err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh);
648 if (err)
649 return err;
650
651 if (!memchr_inv(&oh, 0xff, sizeof(oh)))
652 break;
653
654 crc = logfs_crc32(&oh, sizeof(oh) - 4, 4);
655 if (crc != oh.crc) {
656 printk(KERN_INFO "interrupted header at %llx\n",
657 dev_ofs(sb, segno, ofs));
658 return 0;
659 }
660 }
661 if (ofs != area->a_used_bytes) {
662 printk(KERN_INFO "%x bytes unaccounted data found at %llx\n",
663 ofs - area->a_used_bytes,
664 dev_ofs(sb, segno, area->a_used_bytes));
665 area->a_used_bytes = ofs;
666 }
667 return 0;
668}
669
670int logfs_check_areas(struct super_block *sb)
671{
672 int i, err;
673
674 for_each_area(i) {
675 err = check_area(sb, i);
676 if (err)
677 return err;
678 }
679 return 0;
680}
681
682static void logfs_init_candlist(struct candidate_list *list, int maxcount,
683 int sort_by_ec)
684{
685 list->count = 0;
686 list->maxcount = maxcount;
687 list->sort_by_ec = sort_by_ec;
688 list->rb_tree = RB_ROOT;
689}
690
691int logfs_init_gc(struct super_block *sb)
692{
693 struct logfs_super *super = logfs_super(sb);
694 int i;
695
696 btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool);
697 logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1);
698 logfs_init_candlist(&super->s_reserve_list,
699 super->s_bad_seg_reserve, 1);
700 for_each_area(i)
701 logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
702 logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
703 return 0;
704}
705
706static void logfs_cleanup_list(struct super_block *sb,
707 struct candidate_list *list)
708{
709 struct gc_candidate *cand;
710
711 while (list->count) {
712 cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
713 rb_node);
714 remove_from_list(cand);
715 free_candidate(sb, cand);
716 }
717 BUG_ON(list->rb_tree.rb_node);
718}
719
720void logfs_cleanup_gc(struct super_block *sb)
721{
722 struct logfs_super *super = logfs_super(sb);
723 int i;
724
725 if (!super->s_free_list.count)
726 return;
727
728 /*
729 * FIXME: The btree may still contain a single empty node. So we
730 * call the grim visitor to clean up that mess. Btree code should
731 * do it for us, really.
732 */
733 btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
734 logfs_cleanup_list(sb, &super->s_free_list);
735 logfs_cleanup_list(sb, &super->s_reserve_list);
736 for_each_area(i)
737 logfs_cleanup_list(sb, &super->s_low_list[i]);
738 logfs_cleanup_list(sb, &super->s_ec_list);
739}
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
new file mode 100644
index 000000000000..14ed27274da2
--- /dev/null
+++ b/fs/logfs/inode.c
@@ -0,0 +1,418 @@
1/*
2 * fs/logfs/inode.c - inode handling code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/slab.h>
10#include <linux/writeback.h>
11#include <linux/backing-dev.h>
12
13/*
14 * How soon to reuse old inode numbers? LogFS doesn't store deleted inodes
15 * on the medium. It therefore also lacks a method to store the previous
16 * generation number for deleted inodes. Instead a single generation number
17 * is stored which will be used for new inodes. Being just a 32bit counter,
18 * this can obviously wrap relatively quickly. So we only reuse inodes if we
19 * know that a fair number of inodes can be created before we have to increment
20 * the generation again - effectively adding some bits to the counter.
21 * But being too aggressive here means we keep a very large and very sparse
22 * inode file, wasting space on indirect blocks.
23 * So what is a good value? Beats me. 64k seems moderately bad on both
24 * fronts, so let's use that for now...
25 *
26 * NFS sucks, as everyone already knows.
27 */
28#define INOS_PER_WRAP (0x10000)
29
30/*
31 * Logfs' requirement to read inodes for garbage collection makes life a bit
32 * harder. GC may have to read inodes that are in I_FREEING state, when they
33 * are being written out - and waiting for GC to make progress, naturally.
34 *
35 * So we cannot just call iget() or some variant of it, but first have to check
36 * whether the inode in question might be in I_FREEING state. Therefore we
37 * maintain our own per-sb list of "almost deleted" inodes and check against
38 * that list first. Normally this should be at most 1-2 entries long.
39 *
40 * Also, inodes have logfs-specific reference counting on top of what the vfs
41 * does. When .destroy_inode is called, normally the reference count will drop
42 * to zero and the inode gets deleted. But if GC accessed the inode, its
43 * refcount will remain nonzero and final deletion will have to wait.
44 *
45 * As a result we have two sets of functions to get/put inodes:
46 * logfs_safe_iget/logfs_safe_iput - safe to call from GC context
47 * logfs_iget/iput - normal version
48 */
49static struct kmem_cache *logfs_inode_cache;
50
51static DEFINE_SPINLOCK(logfs_inode_lock);
52
53static void logfs_inode_setops(struct inode *inode)
54{
55 switch (inode->i_mode & S_IFMT) {
56 case S_IFDIR:
57 inode->i_op = &logfs_dir_iops;
58 inode->i_fop = &logfs_dir_fops;
59 inode->i_mapping->a_ops = &logfs_reg_aops;
60 break;
61 case S_IFREG:
62 inode->i_op = &logfs_reg_iops;
63 inode->i_fop = &logfs_reg_fops;
64 inode->i_mapping->a_ops = &logfs_reg_aops;
65 break;
66 case S_IFLNK:
67 inode->i_op = &logfs_symlink_iops;
68 inode->i_mapping->a_ops = &logfs_reg_aops;
69 break;
70 case S_IFSOCK: /* fall through */
71 case S_IFBLK: /* fall through */
72 case S_IFCHR: /* fall through */
73 case S_IFIFO:
74 init_special_inode(inode, inode->i_mode, inode->i_rdev);
75 break;
76 default:
77 BUG();
78 }
79}
80
81static struct inode *__logfs_iget(struct super_block *sb, ino_t ino)
82{
83 struct inode *inode = iget_locked(sb, ino);
84 int err;
85
86 if (!inode)
87 return ERR_PTR(-ENOMEM);
88 if (!(inode->i_state & I_NEW))
89 return inode;
90
91 err = logfs_read_inode(inode);
92 if (err || inode->i_nlink == 0) {
93 /* inode->i_nlink == 0 can be true when called from
94 * block validator */
95 /* set i_nlink to 0 to prevent caching */
96 inode->i_nlink = 0;
97 logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE;
98 iget_failed(inode);
99 if (!err)
100 err = -ENOENT;
101 return ERR_PTR(err);
102 }
103
104 logfs_inode_setops(inode);
105 unlock_new_inode(inode);
106 return inode;
107}
108
/*
 * Normal (non-GC) inode lookup. The master inode and the segment file inode
 * must never be requested through this path - doing so is a BUG. Everything
 * else is forwarded to __logfs_iget().
 */
struct inode *logfs_iget(struct super_block *sb, ino_t ino)
{
	BUG_ON(ino == LOGFS_INO_MASTER);
	BUG_ON(ino == LOGFS_INO_SEGFILE);
	return __logfs_iget(sb, ino);
}
115
116/*
117 * is_cached is set to 1 if we hand out a cached inode, 0 otherwise.
118 * this allows logfs_iput to do the right thing later
119 */
120struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached)
121{
122 struct logfs_super *super = logfs_super(sb);
123 struct logfs_inode *li;
124
125 if (ino == LOGFS_INO_MASTER)
126 return super->s_master_inode;
127 if (ino == LOGFS_INO_SEGFILE)
128 return super->s_segfile_inode;
129
130 spin_lock(&logfs_inode_lock);
131 list_for_each_entry(li, &super->s_freeing_list, li_freeing_list)
132 if (li->vfs_inode.i_ino == ino) {
133 li->li_refcount++;
134 spin_unlock(&logfs_inode_lock);
135 *is_cached = 1;
136 return &li->vfs_inode;
137 }
138 spin_unlock(&logfs_inode_lock);
139
140 *is_cached = 0;
141 return __logfs_iget(sb, ino);
142}
143
/*
 * Final teardown of a logfs inode: unlink it from the freeing list and hand
 * the memory back to the inode slab cache. BUGs if a block structure is
 * still attached. The caller in this file, logfs_destroy_inode(), holds
 * logfs_inode_lock around this call.
 */
static void __logfs_destroy_inode(struct inode *inode)
{
	struct logfs_inode *li = logfs_inode(inode);

	BUG_ON(li->li_block);
	list_del(&li->li_freeing_list);
	kmem_cache_free(logfs_inode_cache, li);
}
152
153static void logfs_destroy_inode(struct inode *inode)
154{
155 struct logfs_inode *li = logfs_inode(inode);
156
157 BUG_ON(list_empty(&li->li_freeing_list));
158 spin_lock(&logfs_inode_lock);
159 li->li_refcount--;
160 if (li->li_refcount == 0)
161 __logfs_destroy_inode(inode);
162 spin_unlock(&logfs_inode_lock);
163}
164
165void logfs_safe_iput(struct inode *inode, int is_cached)
166{
167 if (inode->i_ino == LOGFS_INO_MASTER)
168 return;
169 if (inode->i_ino == LOGFS_INO_SEGFILE)
170 return;
171
172 if (is_cached) {
173 logfs_destroy_inode(inode);
174 return;
175 }
176
177 iput(inode);
178}
179
180static void logfs_init_inode(struct super_block *sb, struct inode *inode)
181{
182 struct logfs_inode *li = logfs_inode(inode);
183 int i;
184
185 li->li_flags = 0;
186 li->li_height = 0;
187 li->li_used_bytes = 0;
188 li->li_block = NULL;
189 inode->i_uid = 0;
190 inode->i_gid = 0;
191 inode->i_size = 0;
192 inode->i_blocks = 0;
193 inode->i_ctime = CURRENT_TIME;
194 inode->i_mtime = CURRENT_TIME;
195 inode->i_nlink = 1;
196 INIT_LIST_HEAD(&li->li_freeing_list);
197
198 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
199 li->li_data[i] = 0;
200
201 return;
202}
203
204static struct inode *logfs_alloc_inode(struct super_block *sb)
205{
206 struct logfs_inode *li;
207
208 li = kmem_cache_alloc(logfs_inode_cache, GFP_NOFS);
209 if (!li)
210 return NULL;
211 logfs_init_inode(sb, &li->vfs_inode);
212 return &li->vfs_inode;
213}
214
215/*
216 * In logfs inodes are written to an inode file. The inode file, like any
217 * other file, is managed with an inode. The inode file's inode, aka master
218 * inode, requires special handling in several respects. First, it cannot be
219 * written to the inode file, so it is stored in the journal instead.
220 *
221 * Secondly, this inode cannot be written back and destroyed before all other
222 * inodes have been written. The ordering is important. Linux' VFS is happily
223 * unaware of the ordering constraint and would ordinarily destroy the master
224 * inode at umount time while other inodes are still in use and dirty. Not
225 * good.
226 *
227 * So logfs makes sure the master inode is not written until all other inodes
228 * have been destroyed. Sadly, this method has another side-effect. The VFS
229 * will notice one remaining inode and print a frightening warning message.
230 * Worse, it is impossible to judge whether such a warning was caused by the
231 * master inode or any other inodes have leaked as well.
232 *
233 * Our attempt of solving this is with logfs_new_meta_inode() below. Its
234 * purpose is to create a new inode that will not trigger the warning if such
235 * an inode is still in use. An ugly hack, no doubt. Suggestions for
236 * improvement are welcome.
237 */
238struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino)
239{
240 struct inode *inode;
241
242 inode = logfs_alloc_inode(sb);
243 if (!inode)
244 return ERR_PTR(-ENOMEM);
245
246 inode->i_mode = S_IFREG;
247 inode->i_ino = ino;
248 inode->i_sb = sb;
249
250 /* This is a blatant copy of alloc_inode code. We'd need alloc_inode
251 * to be nonstatic, alas. */
252 {
253 struct address_space * const mapping = &inode->i_data;
254
255 mapping->a_ops = &logfs_reg_aops;
256 mapping->host = inode;
257 mapping->flags = 0;
258 mapping_set_gfp_mask(mapping, GFP_NOFS);
259 mapping->assoc_mapping = NULL;
260 mapping->backing_dev_info = &default_backing_dev_info;
261 inode->i_mapping = mapping;
262 inode->i_nlink = 1;
263 }
264
265 return inode;
266}
267
268struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino)
269{
270 struct inode *inode;
271 int err;
272
273 inode = logfs_new_meta_inode(sb, ino);
274 if (IS_ERR(inode))
275 return inode;
276
277 err = logfs_read_inode(inode);
278 if (err) {
279 destroy_meta_inode(inode);
280 return ERR_PTR(err);
281 }
282 logfs_inode_setops(inode);
283 return inode;
284}
285
286static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
287{
288 int ret;
289 long flags = WF_LOCK;
290
291 /* Can only happen if creat() failed. Safe to skip. */
292 if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN)
293 return 0;
294
295 ret = __logfs_write_inode(inode, flags);
296 LOGFS_BUG_ON(ret, inode->i_sb);
297 return ret;
298}
299
300void destroy_meta_inode(struct inode *inode)
301{
302 if (inode) {
303 if (inode->i_data.nrpages)
304 truncate_inode_pages(&inode->i_data, 0);
305 logfs_clear_inode(inode);
306 kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
307 }
308}
309
/*
 * .drop_inode hook, called with inode_lock held.
 *
 * Puts the inode on the per-sb freeing list (under logfs_inode_lock) before
 * handing over to generic_drop_inode(), so that logfs_safe_iget() can still
 * find it on that list.
 */
static void logfs_drop_inode(struct inode *inode)
{
	struct logfs_super *super = logfs_super(inode->i_sb);
	struct logfs_inode *li = logfs_inode(inode);

	spin_lock(&logfs_inode_lock);
	list_move(&li->li_freeing_list, &super->s_freeing_list);
	spin_unlock(&logfs_inode_lock);
	generic_drop_inode(inode);
}
321
/*
 * Pick an inode number and generation for a new inode, under
 * s_journal_mutex.
 *
 * The number is the next hole in the master inode at or after s_last_ino.
 * Every call consumes one slot of s_inos_till_wrap; once that goes negative
 * the search position is reset to LOGFS_RESERVED_INOS, the generation is
 * bumped and the counter refilled with INOS_PER_WRAP.
 */
static void logfs_set_ino_generation(struct super_block *sb,
		struct inode *inode)
{
	struct logfs_super *super = logfs_super(sb);
	u64 ino;

	mutex_lock(&super->s_journal_mutex);
	ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino);
	super->s_last_ino = ino;
	super->s_inos_till_wrap--;
	if (super->s_inos_till_wrap < 0) {
		super->s_last_ino = LOGFS_RESERVED_INOS;
		super->s_generation++;
		super->s_inos_till_wrap = INOS_PER_WRAP;
	}
	/* on a wrap the inode keeps the pre-wrap number but gets the new generation */
	inode->i_ino = ino;
	inode->i_generation = super->s_generation;
	mutex_unlock(&super->s_journal_mutex);
}
341
342struct inode *logfs_new_inode(struct inode *dir, int mode)
343{
344 struct super_block *sb = dir->i_sb;
345 struct inode *inode;
346
347 inode = new_inode(sb);
348 if (!inode)
349 return ERR_PTR(-ENOMEM);
350
351 logfs_init_inode(sb, inode);
352
353 /* inherit parent flags */
354 logfs_inode(inode)->li_flags |=
355 logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED;
356
357 inode->i_mode = mode;
358 logfs_set_ino_generation(sb, inode);
359
360 inode->i_uid = current_fsuid();
361 inode->i_gid = current_fsgid();
362 if (dir->i_mode & S_ISGID) {
363 inode->i_gid = dir->i_gid;
364 if (S_ISDIR(mode))
365 inode->i_mode |= S_ISGID;
366 }
367
368 logfs_inode_setops(inode);
369 insert_inode_hash(inode);
370
371 return inode;
372}
373
374static void logfs_init_once(void *_li)
375{
376 struct logfs_inode *li = _li;
377 int i;
378
379 li->li_flags = 0;
380 li->li_used_bytes = 0;
381 li->li_refcount = 1;
382 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
383 li->li_data[i] = 0;
384 inode_init_once(&li->vfs_inode);
385}
386
/*
 * .sync_fs hook: ask the device backend to sync. @wait is ignored and the
 * function always returns 0.
 */
static int logfs_sync_fs(struct super_block *sb, int wait)
{
	/* FIXME: write anchor */
	logfs_super(sb)->s_devops->sync(sb);
	return 0;
}
393
/* Superblock operations table wiring the VFS hooks to the logfs handlers. */
const struct super_operations logfs_super_operations = {
	.alloc_inode	= logfs_alloc_inode,
	.clear_inode	= logfs_clear_inode,
	.delete_inode	= logfs_delete_inode,
	.destroy_inode	= logfs_destroy_inode,
	.drop_inode	= logfs_drop_inode,
	.write_inode	= logfs_write_inode,
	.statfs		= logfs_statfs,
	.sync_fs	= logfs_sync_fs,
};
404
405int logfs_init_inode_cache(void)
406{
407 logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
408 sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT,
409 logfs_init_once);
410 if (!logfs_inode_cache)
411 return -ENOMEM;
412 return 0;
413}
414
/* Destroy the slab cache created by logfs_init_inode_cache(). */
void logfs_destroy_inode_cache(void)
{
	kmem_cache_destroy(logfs_inode_cache);
}
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
new file mode 100644
index 000000000000..fb0a613f885b
--- /dev/null
+++ b/fs/logfs/journal.c
@@ -0,0 +1,898 @@
1/*
2 * fs/logfs/journal.c - journal handling code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/slab.h>
10
/*
 * Recompute s_free_bytes (and s_no_journal_segs) from the current segment
 * counts.
 *
 * Starting from the total number of segments, the two superblock segments,
 * every configured journal segment and two segments per level are taken
 * out. The remainder is converted to bytes, reduced by the bytes already
 * in use, a small per-level margin and the reserve, and clamped at zero.
 */
static void logfs_calc_free(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	u64 reserve, no_segs = super->s_no_segs;
	s64 free;
	int i;

	/* superblock segments */
	no_segs -= 2;
	super->s_no_journal_segs = 0;
	/* journal */
	journal_for_each(i)
		if (super->s_journal_seg[i]) {
			no_segs--;
			super->s_no_journal_segs++;
		}

	/* open segments plus one extra per level for GC */
	no_segs -= 2 * super->s_total_levels;

	free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE);
	free -= super->s_used_bytes;
	/* just a bit extra */
	free -= super->s_total_levels * 4096;

	/* Bad blocks are 'paid' for with speed reserve - the filesystem
	 * simply gets slower as bad blocks accumulate. Until the bad blocks
	 * exceed the speed reserve - then the filesystem gets smaller.
	 */
	reserve = super->s_bad_segments + super->s_bad_seg_reserve;
	reserve *= super->s_segsize - LOGFS_SEGMENT_RESERVE;
	reserve = max(reserve, super->s_speed_reserve);
	free -= reserve;
	if (free < 0)
		free = 0;

	super->s_free_bytes = free;
}
49
50static void reserve_sb_and_journal(struct super_block *sb)
51{
52 struct logfs_super *super = logfs_super(sb);
53 struct btree_head32 *head = &super->s_reserved_segments;
54 int i, err;
55
56 err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[0]), (void *)1,
57 GFP_KERNEL);
58 BUG_ON(err);
59
60 err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[1]), (void *)1,
61 GFP_KERNEL);
62 BUG_ON(err);
63
64 journal_for_each(i) {
65 if (!super->s_journal_seg[i])
66 continue;
67 err = btree_insert32(head, super->s_journal_seg[i], (void *)1,
68 GFP_KERNEL);
69 BUG_ON(err);
70 }
71}
72
73static void read_dynsb(struct super_block *sb,
74 struct logfs_je_dynsb *dynsb)
75{
76 struct logfs_super *super = logfs_super(sb);
77
78 super->s_gec = be64_to_cpu(dynsb->ds_gec);
79 super->s_sweeper = be64_to_cpu(dynsb->ds_sweeper);
80 super->s_victim_ino = be64_to_cpu(dynsb->ds_victim_ino);
81 super->s_rename_dir = be64_to_cpu(dynsb->ds_rename_dir);
82 super->s_rename_pos = be64_to_cpu(dynsb->ds_rename_pos);
83 super->s_used_bytes = be64_to_cpu(dynsb->ds_used_bytes);
84 super->s_generation = be32_to_cpu(dynsb->ds_generation);
85}
86
87static void read_anchor(struct super_block *sb,
88 struct logfs_je_anchor *da)
89{
90 struct logfs_super *super = logfs_super(sb);
91 struct inode *inode = super->s_master_inode;
92 struct logfs_inode *li = logfs_inode(inode);
93 int i;
94
95 super->s_last_ino = be64_to_cpu(da->da_last_ino);
96 li->li_flags = 0;
97 li->li_height = da->da_height;
98 i_size_write(inode, be64_to_cpu(da->da_size));
99 li->li_used_bytes = be64_to_cpu(da->da_used_bytes);
100
101 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
102 li->li_data[i] = be64_to_cpu(da->da_data[i]);
103}
104
105static void read_erasecount(struct super_block *sb,
106 struct logfs_je_journal_ec *ec)
107{
108 struct logfs_super *super = logfs_super(sb);
109 int i;
110
111 journal_for_each(i)
112 super->s_journal_ec[i] = be32_to_cpu(ec->ec[i]);
113}
114
115static int read_area(struct super_block *sb, struct logfs_je_area *a)
116{
117 struct logfs_super *super = logfs_super(sb);
118 struct logfs_area *area = super->s_area[a->gc_level];
119 u64 ofs;
120 u32 writemask = ~(super->s_writesize - 1);
121
122 if (a->gc_level >= LOGFS_NO_AREAS)
123 return -EIO;
124 if (a->vim != VIM_DEFAULT)
125 return -EIO; /* TODO: close area and continue */
126
127 area->a_used_bytes = be32_to_cpu(a->used_bytes);
128 area->a_written_bytes = area->a_used_bytes & writemask;
129 area->a_segno = be32_to_cpu(a->segno);
130 if (area->a_segno)
131 area->a_is_open = 1;
132
133 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
134 if (super->s_writesize > 1)
135 logfs_buf_recover(area, ofs, a + 1, super->s_writesize);
136 else
137 logfs_buf_recover(area, ofs, NULL, 0);
138 return 0;
139}
140
141static void *unpack(void *from, void *to)
142{
143 struct logfs_journal_header *jh = from;
144 void *data = from + sizeof(struct logfs_journal_header);
145 int err;
146 size_t inlen, outlen;
147
148 inlen = be16_to_cpu(jh->h_len);
149 outlen = be16_to_cpu(jh->h_datalen);
150
151 if (jh->h_compr == COMPR_NONE)
152 memcpy(to, data, inlen);
153 else {
154 err = logfs_uncompress(data, to, inlen, outlen);
155 BUG_ON(err);
156 }
157 return to;
158}
159
160static int __read_je_header(struct super_block *sb, u64 ofs,
161 struct logfs_journal_header *jh)
162{
163 struct logfs_super *super = logfs_super(sb);
164 size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
165 + MAX_JOURNAL_HEADER;
166 u16 type, len, datalen;
167 int err;
168
169 /* read header only */
170 err = wbuf_read(sb, ofs, sizeof(*jh), jh);
171 if (err)
172 return err;
173 type = be16_to_cpu(jh->h_type);
174 len = be16_to_cpu(jh->h_len);
175 datalen = be16_to_cpu(jh->h_datalen);
176 if (len > sb->s_blocksize)
177 return -EIO;
178 if ((type < JE_FIRST) || (type > JE_LAST))
179 return -EIO;
180 if (datalen > bufsize)
181 return -EIO;
182 return 0;
183}
184
185static int __read_je_payload(struct super_block *sb, u64 ofs,
186 struct logfs_journal_header *jh)
187{
188 u16 len;
189 int err;
190
191 len = be16_to_cpu(jh->h_len);
192 err = wbuf_read(sb, ofs + sizeof(*jh), len, jh + 1);
193 if (err)
194 return err;
195 if (jh->h_crc != logfs_crc32(jh, len + sizeof(*jh), 4)) {
196 /* Old code was confused. It forgot about the header length
197 * and stopped calculating the crc 16 bytes before the end
198 * of data - ick!
199 * FIXME: Remove this hack once the old code is fixed.
200 */
201 if (jh->h_crc == logfs_crc32(jh, len, 4))
202 WARN_ON_ONCE(1);
203 else
204 return -EIO;
205 }
206 return 0;
207}
208
209/*
210 * jh needs to be large enough to hold the complete entry, not just the header
211 */
212static int __read_je(struct super_block *sb, u64 ofs,
213 struct logfs_journal_header *jh)
214{
215 int err;
216
217 err = __read_je_header(sb, ofs, jh);
218 if (err)
219 return err;
220 return __read_je_payload(sb, ofs, jh);
221}
222
223static int read_je(struct super_block *sb, u64 ofs)
224{
225 struct logfs_super *super = logfs_super(sb);
226 struct logfs_journal_header *jh = super->s_compressed_je;
227 void *scratch = super->s_je;
228 u16 type, datalen;
229 int err;
230
231 err = __read_je(sb, ofs, jh);
232 if (err)
233 return err;
234 type = be16_to_cpu(jh->h_type);
235 datalen = be16_to_cpu(jh->h_datalen);
236
237 switch (type) {
238 case JE_DYNSB:
239 read_dynsb(sb, unpack(jh, scratch));
240 break;
241 case JE_ANCHOR:
242 read_anchor(sb, unpack(jh, scratch));
243 break;
244 case JE_ERASECOUNT:
245 read_erasecount(sb, unpack(jh, scratch));
246 break;
247 case JE_AREA:
248 read_area(sb, unpack(jh, scratch));
249 break;
250 case JE_OBJ_ALIAS:
251 err = logfs_load_object_aliases(sb, unpack(jh, scratch),
252 datalen);
253 break;
254 default:
255 WARN_ON_ONCE(1);
256 return -EIO;
257 }
258 return err;
259}
260
/*
 * Replay the journal stored in segment @segno.
 *
 * The segment is scanned in header-sized steps for valid JE_COMMIT entries;
 * the last one found wins. Its payload is an array of big-endian device
 * offsets, one per journal entry, each of which is then read via read_je().
 *
 * Returns -ENOENT if no usable commit was found, the first error from
 * read_je(), or 0 on success (in which case the journal area is pointed at
 * @segno).
 */
static int logfs_read_segment(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_journal_header *jh = super->s_compressed_je;
	u64 ofs, seg_ofs = dev_ofs(sb, segno, 0);
	u32 h_ofs, last_ofs = 0;
	u16 len, datalen, last_len = 0;
	int i, err;

	/* search for most recent commit */
	for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*jh)) {
		ofs = seg_ofs + h_ofs;
		err = __read_je_header(sb, ofs, jh);
		if (err)
			continue;
		if (jh->h_type != cpu_to_be16(JE_COMMIT))
			continue;
		err = __read_je_payload(sb, ofs, jh);
		if (err)
			continue;
		len = be16_to_cpu(jh->h_len);
		datalen = be16_to_cpu(jh->h_datalen);
		/* payload must fit s_je_array and consist of whole offsets */
		if ((datalen > sizeof(super->s_je_array)) ||
				(datalen % sizeof(__be64)))
			continue;
		last_ofs = h_ofs;
		last_len = datalen;
		/* skip the payload; the loop increment adds the final sizeof(*jh) */
		h_ofs += ALIGN(len, sizeof(*jh)) - sizeof(*jh);
	}
	/* read commit */
	/* last_ofs == 0 doubles as the "nothing found" marker */
	if (last_ofs == 0)
		return -ENOENT;
	ofs = seg_ofs + last_ofs;
	log_journal("Read commit from %llx\n", ofs);
	err = __read_je(sb, ofs, jh);
	BUG_ON(err); /* We should have caught it in the scan loop already */
	/* NOTE(review): unreachable while the BUG_ON above is active */
	if (err)
		return err;
	/* uncompress */
	unpack(jh, super->s_je_array);
	super->s_no_je = last_len / sizeof(__be64);
	/* iterate over array */
	for (i = 0; i < super->s_no_je; i++) {
		err = read_je(sb, be64_to_cpu(super->s_je_array[i]));
		if (err)
			return err;
	}
	super->s_journal_area->a_segno = segno;
	return 0;
}
311
312static u64 read_gec(struct super_block *sb, u32 segno)
313{
314 struct logfs_segment_header sh;
315 __be32 crc;
316 int err;
317
318 if (!segno)
319 return 0;
320 err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
321 if (err)
322 return 0;
323 crc = logfs_crc32(&sh, sizeof(sh), 4);
324 if (crc != sh.crc) {
325 WARN_ON(sh.gec != cpu_to_be64(0xffffffffffffffffull));
326 /* Most likely it was just erased */
327 return 0;
328 }
329 return be64_to_cpu(sh.gec);
330}
331
332static int logfs_read_journal(struct super_block *sb)
333{
334 struct logfs_super *super = logfs_super(sb);
335 u64 gec[LOGFS_JOURNAL_SEGS], max;
336 u32 segno;
337 int i, max_i;
338
339 max = 0;
340 max_i = -1;
341 journal_for_each(i) {
342 segno = super->s_journal_seg[i];
343 gec[i] = read_gec(sb, super->s_journal_seg[i]);
344 if (gec[i] > max) {
345 max = gec[i];
346 max_i = i;
347 }
348 }
349 if (max_i == -1)
350 return -EIO;
351 /* FIXME: Try older segments in case of error */
352 return logfs_read_segment(sb, super->s_journal_seg[max_i]);
353}
354
/*
 * Advance the journal area to the next journal segment.
 *
 * First search the current segment (outer loop), then pick the next segment
 * in the array, skipping any zero entries (inner loop). The search wraps
 * around at LOGFS_JOURNAL_SEGS. The chosen segment's erase count is
 * incremented and copied into the area.
 *
 * BUGs if the area's current segment is not a journal segment.
 */
static void journal_get_free_segment(struct logfs_area *area)
{
	struct logfs_super *super = logfs_super(area->a_sb);
	int i;

	journal_for_each(i) {
		if (area->a_segno != super->s_journal_seg[i])
			continue;

		do {
			i++;
			if (i == LOGFS_JOURNAL_SEGS)
				i = 0;
		} while (!super->s_journal_seg[i]);

		area->a_segno = super->s_journal_seg[i];
		area->a_erase_count = ++(super->s_journal_ec[i]);
		log_journal("Journal now at %x (ec %x)\n", area->a_segno,
				area->a_erase_count);
		return;
	}
	BUG();
}
382
/* Intentionally empty; see the comment in the body. */
static void journal_get_erase_count(struct logfs_area *area)
{
	/* erase count is stored globally and incremented in
	 * journal_get_free_segment() - nothing to do here */
}
388
/*
 * Erase the journal area's current segment and write a fresh SEG_JOURNAL
 * segment header at its start.
 *
 * The header is built inside a union padded to a multiple of 16 bytes, and
 * the padded size is what gets written and accounted in a_used_bytes.
 *
 * Returns the error from logfs_erase_segment(), or 0.
 */
static int journal_erase_segment(struct logfs_area *area)
{
	struct super_block *sb = area->a_sb;
	union {
		struct logfs_segment_header sh;
		unsigned char c[ALIGN(sizeof(struct logfs_segment_header), 16)];
	} u;
	u64 ofs;
	int err;

	err = logfs_erase_segment(sb, area->a_segno, 1);
	if (err)
		return err;

	memset(&u, 0, sizeof(u));
	u.sh.pad = 0;
	u.sh.type = SEG_JOURNAL;
	u.sh.level = 0;
	u.sh.segno = cpu_to_be32(area->a_segno);
	u.sh.ec = cpu_to_be32(area->a_erase_count);
	u.sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
	/* CRC last, once every other header field is final */
	u.sh.crc = logfs_crc32(&u.sh, sizeof(u.sh), 4);

	/* This causes a bug in segment.c. Not yet. */
	//logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0);

	ofs = dev_ofs(sb, area->a_segno, 0);
	area->a_used_bytes = sizeof(u);
	logfs_buf_write(area, ofs, &u, sizeof(u));
	return 0;
}
420
/*
 * Fill in the journal header @jh for a payload that is already in place
 * directly behind it.
 *
 * @len is the stored payload length, @datalen the uncompressed length,
 * @compr the compression type. @super is unused.
 *
 * Returns the on-medium size of the entry: the payload rounded up to 16
 * bytes plus the header.
 */
static size_t __logfs_write_header(struct logfs_super *super,
		struct logfs_journal_header *jh, size_t len, size_t datalen,
		u16 type, u8 compr)
{
	jh->h_len = cpu_to_be16(len);
	jh->h_type = cpu_to_be16(type);
	jh->h_datalen = cpu_to_be16(datalen);
	jh->h_compr = compr;
	jh->h_pad[0] = 'H';
	jh->h_pad[1] = 'E';
	jh->h_pad[2] = 'A';
	jh->h_pad[3] = 'D';
	jh->h_pad[4] = 'R';
	/* CRC covers header and payload, so it must be computed last */
	jh->h_crc = logfs_crc32(jh, len + sizeof(*jh), 4);
	return ALIGN(len, 16) + sizeof(*jh);
}
437
438static size_t logfs_write_header(struct logfs_super *super,
439 struct logfs_journal_header *jh, size_t datalen, u16 type)
440{
441 size_t len = datalen;
442
443 return __logfs_write_header(super, jh, len, datalen, type, COMPR_NONE);
444}
445
/*
 * Payload size of an erase-count journal entry: one big-endian 32bit counter
 * per journal segment slot. @super is unused.
 */
static inline size_t logfs_journal_erasecount_size(struct logfs_super *super)
{
	return LOGFS_JOURNAL_SEGS * sizeof(__be32);
}
450
451static void *logfs_write_erasecount(struct super_block *sb, void *_ec,
452 u16 *type, size_t *len)
453{
454 struct logfs_super *super = logfs_super(sb);
455 struct logfs_je_journal_ec *ec = _ec;
456 int i;
457
458 journal_for_each(i)
459 ec->ec[i] = cpu_to_be32(super->s_journal_ec[i]);
460 *type = JE_ERASECOUNT;
461 *len = logfs_journal_erasecount_size(super);
462 return ec;
463}
464
/*
 * btree visitor: fold one shadow entry into the superblock's space
 * accounting and free it.
 *
 * @_shadow is the struct logfs_shadow, @_sb the super_block cast to
 * unsigned long; the remaining two arguments are ignored.
 */
static void account_shadow(void *_shadow, unsigned long _sb, u64 ignore,
		size_t ignore2)
{
	struct logfs_shadow *shadow = _shadow;
	struct super_block *sb = (void *)_sb;
	struct logfs_super *super = logfs_super(sb);

	/* consume new space */
	super->s_free_bytes -= shadow->new_len;
	super->s_used_bytes += shadow->new_len;
	super->s_dirty_used_bytes -= shadow->new_len;

	/* free up old space */
	super->s_free_bytes += shadow->old_len;
	super->s_used_bytes -= shadow->old_len;
	super->s_dirty_free_bytes -= shadow->old_len;

	/* move the per-segment usage from the old location to the new one */
	logfs_set_segment_used(sb, shadow->old_ofs, -shadow->old_len);
	logfs_set_segment_used(sb, shadow->new_ofs, shadow->new_len);

	log_journal("account_shadow(%llx, %llx, %x) %llx->%llx %x->%x\n",
			shadow->ino, shadow->bix, shadow->gc_level,
			shadow->old_ofs, shadow->new_ofs,
			shadow->old_len, shadow->new_len);
	/* the shadow is dead after this - must stay the last use */
	mempool_free(shadow, super->s_shadow_pool);
}
491
/*
 * Account for and release every pending shadow entry: both shadow btrees
 * are emptied through account_shadow(), the segment map is emptied, and the
 * master inode's block structure, if any, is freed.
 *
 * BUGs if the master inode's used-bytes counter has gone negative.
 */
static void account_shadows(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct inode *inode = super->s_master_inode;
	struct logfs_inode *li = logfs_inode(inode);
	struct shadow_tree *tree = &super->s_shadow_tree;

	btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow);
	btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow);
	btree_grim_visitor32(&tree->segment_map, 0, NULL);
	tree->no_shadowed_segments = 0;

	if (li->li_block) {
		/*
		 * We never actually use the structure, when attached to the
		 * master inode. But it is easier to always free it here than
		 * to have checks in several places elsewhere when allocating
		 * it.
		 */
		li->li_block->ops->free_block(sb, li->li_block);
	}
	BUG_ON((s64)li->li_used_bytes < 0);
}
515
516static void *__logfs_write_anchor(struct super_block *sb, void *_da,
517 u16 *type, size_t *len)
518{
519 struct logfs_super *super = logfs_super(sb);
520 struct logfs_je_anchor *da = _da;
521 struct inode *inode = super->s_master_inode;
522 struct logfs_inode *li = logfs_inode(inode);
523 int i;
524
525 da->da_height = li->li_height;
526 da->da_last_ino = cpu_to_be64(super->s_last_ino);
527 da->da_size = cpu_to_be64(i_size_read(inode));
528 da->da_used_bytes = cpu_to_be64(li->li_used_bytes);
529 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
530 da->da_data[i] = cpu_to_be64(li->li_data[i]);
531 *type = JE_ANCHOR;
532 *len = sizeof(*da);
533 return da;
534}
535
536static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb,
537 u16 *type, size_t *len)
538{
539 struct logfs_super *super = logfs_super(sb);
540 struct logfs_je_dynsb *dynsb = _dynsb;
541
542 dynsb->ds_gec = cpu_to_be64(super->s_gec);
543 dynsb->ds_sweeper = cpu_to_be64(super->s_sweeper);
544 dynsb->ds_victim_ino = cpu_to_be64(super->s_victim_ino);
545 dynsb->ds_rename_dir = cpu_to_be64(super->s_rename_dir);
546 dynsb->ds_rename_pos = cpu_to_be64(super->s_rename_pos);
547 dynsb->ds_used_bytes = cpu_to_be64(super->s_used_bytes);
548 dynsb->ds_generation = cpu_to_be32(super->s_generation);
549 *type = JE_DYNSB;
550 *len = sizeof(*dynsb);
551 return dynsb;
552}
553
/*
 * Copy the area's current, partially filled write unit into @wbuf.
 *
 * The unit is the s_writesize bytes starting at the area's used-bytes
 * position rounded down to a multiple of s_writesize. It is read from the
 * page cache of the mapping inode; the page must already be there (BUG
 * otherwise).
 *
 * NOTE(review): find_lock_page() normally returns the page with an elevated
 * refcount, but only unlock_page() is called here - confirm whether a
 * page_cache_release() is missing or the reference is kept on purpose.
 */
static void write_wbuf(struct super_block *sb, struct logfs_area *area,
		void *wbuf)
{
	struct logfs_super *super = logfs_super(sb);
	struct address_space *mapping = super->s_mapping_inode->i_mapping;
	u64 ofs;
	pgoff_t index;
	int page_ofs;
	struct page *page;

	ofs = dev_ofs(sb, area->a_segno,
			area->a_used_bytes & ~(super->s_writesize - 1));
	index = ofs >> PAGE_SHIFT;
	page_ofs = ofs & (PAGE_SIZE - 1);

	page = find_lock_page(mapping, index);
	BUG_ON(!page);
	memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize);
	unlock_page(page);
}
574
/*
 * Journal entry generator: serialise the state of the area selected by
 * s_sum_index into the scratch buffer @_a. When the write size is larger
 * than 1, the area's pending write unit is appended right behind the
 * structure.
 *
 * Sets *type to JE_AREA and *len to the structure size plus s_writesize;
 * returns the buffer.
 *
 * NOTE(review): *len includes s_writesize even when no write unit was
 * copied (s_writesize <= 1) - confirm that is intended.
 */
static void *logfs_write_area(struct super_block *sb, void *_a,
		u16 *type, size_t *len)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_area *area = super->s_area[super->s_sum_index];
	struct logfs_je_area *a = _a;

	a->vim = VIM_DEFAULT;
	a->gc_level = super->s_sum_index;
	a->used_bytes = cpu_to_be32(area->a_used_bytes);
	a->segno = cpu_to_be32(area->a_segno);
	if (super->s_writesize > 1)
		write_wbuf(sb, area, a + 1);

	*type = JE_AREA;
	*len = sizeof(*a) + super->s_writesize;
	return a;
}
593
/*
 * Journal entry generator for the commit entry. The payload is the array of
 * journal entry offsets collected so far (s_je_array, s_no_je entries); the
 * scratch buffer @h is not used.
 *
 * Sets *type to JE_COMMIT and *len to the array's size in bytes.
 */
static void *logfs_write_commit(struct super_block *sb, void *h,
		u16 *type, size_t *len)
{
	struct logfs_super *super = logfs_super(sb);

	*type = JE_COMMIT;
	*len = super->s_no_je * sizeof(__be64);
	return super->s_je_array;
}
603
/*
 * Assemble a complete journal entry (header + payload) for @buf in the
 * s_compressed_je buffer.
 *
 * The payload is compressed with zlib. If compression fails, or for anchor
 * entries, it is stored uncompressed instead. The payload is zero-padded up
 * to a multiple of 16 bytes. Payloads longer than a block are a BUG.
 *
 * Returns the on-medium size of the entry as reported by the header writer.
 */
static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
		size_t len)
{
	struct logfs_super *super = logfs_super(sb);
	void *header = super->s_compressed_je;
	void *data = header + sizeof(struct logfs_journal_header);
	ssize_t compr_len, pad_len;
	u8 compr = COMPR_ZLIB;

	/* empty payload: header only, nothing to compress */
	if (len == 0)
		return logfs_write_header(super, header, 0, type);

	BUG_ON(len > sb->s_blocksize);
	compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
	if (compr_len < 0 || type == JE_ANCHOR) {
		memcpy(data, buf, len);
		compr_len = len;
		compr = COMPR_NONE;
	}

	pad_len = ALIGN(compr_len, 16);
	memset(data + compr_len, 0, pad_len - compr_len);

	return __logfs_write_header(super, header, compr_len, len, type, compr);
}
629
/*
 * Reserve *bytes bytes in @area.
 *
 * With @must_pad set, the reservation is extended so that the area's used
 * bytes end on a write-size boundary, and *bytes is updated to the padded
 * length.
 *
 * Returns the device offset of the reserved range, or -EAGAIN if
 * logfs_open_area() fails (whatever error it reported).
 */
static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes,
		int must_pad)
{
	u32 writesize = logfs_super(area->a_sb)->s_writesize;
	s32 ofs;
	int ret;

	ret = logfs_open_area(area, *bytes);
	if (ret)
		return -EAGAIN;

	/* remember where the reservation starts before growing the area */
	ofs = area->a_used_bytes;
	area->a_used_bytes += *bytes;

	if (must_pad) {
		area->a_used_bytes = ALIGN(area->a_used_bytes, writesize);
		*bytes = area->a_used_bytes - ofs;
	}

	return dev_ofs(area->a_sb, area->a_segno, ofs);
}
651
/*
 * Write one journal entry with payload @buf of @buf_len bytes and type
 * @type into the journal area, and record its device offset in s_je_array.
 *
 * Commit entries are padded to a full write unit.
 *
 * Returns 0 on success or the negative error from logfs_get_free_bytes().
 * Overflowing s_je_array is a BUG.
 */
static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type,
		size_t buf_len)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_area *area = super->s_journal_area;
	struct logfs_journal_header *jh = super->s_compressed_je;
	size_t len;
	int must_pad = 0;
	s64 ofs;

	/* builds the entry in s_compressed_je, which jh points at */
	len = __logfs_write_je(sb, buf, type, buf_len);
	if (jh->h_type == cpu_to_be16(JE_COMMIT))
		must_pad = 1;

	ofs = logfs_get_free_bytes(area, &len, must_pad);
	if (ofs < 0)
		return ofs;
	logfs_buf_write(area, ofs, super->s_compressed_je, len);
	BUG_ON(super->s_no_je >= MAX_JOURNAL_ENTRIES);
	super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs);
	return 0;
}
674
675static int logfs_write_je(struct super_block *sb,
676 void* (*write)(struct super_block *sb, void *scratch,
677 u16 *type, size_t *len))
678{
679 void *buf;
680 size_t len;
681 u16 type;
682
683 buf = write(sb, logfs_super(sb)->s_je, &type, &len);
684 return logfs_write_je_buf(sb, buf, type, len);
685}
686
687int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
688 level_t level, int child_no, __be64 val)
689{
690 struct logfs_super *super = logfs_super(sb);
691 struct logfs_obj_alias *oa = super->s_je;
692 int err = 0, fill = super->s_je_fill;
693
694 log_aliases("logfs_write_obj_aliases #%x(%llx, %llx, %x, %x) %llx\n",
695 fill, ino, bix, level, child_no, be64_to_cpu(val));
696 oa[fill].ino = cpu_to_be64(ino);
697 oa[fill].bix = cpu_to_be64(bix);
698 oa[fill].val = val;
699 oa[fill].level = (__force u8)level;
700 oa[fill].child_no = cpu_to_be16(child_no);
701 fill++;
702 if (fill >= sb->s_blocksize / sizeof(*oa)) {
703 err = logfs_write_je_buf(sb, oa, JE_OBJ_ALIAS, sb->s_blocksize);
704 fill = 0;
705 }
706
707 super->s_je_fill = fill;
708 return err;
709}
710
711static int logfs_write_obj_aliases(struct super_block *sb)
712{
713 struct logfs_super *super = logfs_super(sb);
714 int err;
715
716 log_journal("logfs_write_obj_aliases: %d aliases to write\n",
717 super->s_no_object_aliases);
718 super->s_je_fill = 0;
719 err = logfs_write_obj_aliases_pagecache(sb);
720 if (err)
721 return err;
722
723 if (super->s_je_fill)
724 err = logfs_write_je_buf(sb, super->s_je, JE_OBJ_ALIAS,
725 super->s_je_fill
726 * sizeof(struct logfs_obj_alias));
727 return err;
728}
729
730/*
731 * Write all journal entries. The goto logic ensures that all journal entries
732 * are written whenever a new segment is used. It is ugly and potentially a
733 * bit wasteful, but robustness is more important. With this we can *always*
734 * erase all journal segments except the one containing the most recent commit.
 *
 * The function does nothing unless LOGFS_SB_FLAG_DIRTY is set; the flag is
 * cleared before any entry is written. All journal writing below happens
 * with s_journal_mutex held.
 *
 * NOTE(review): every failed write jumps back to "again" and this function
 * has no retry limit - confirm that a persistent error cannot keep it
 * looping forever.
735 */
736void logfs_write_anchor(struct super_block *sb)
737{
738 struct logfs_super *super = logfs_super(sb);
739 struct logfs_area *area = super->s_journal_area;
740 int i, err;
741
742 if (!(super->s_flags & LOGFS_SB_FLAG_DIRTY))
743 return;
744 super->s_flags &= ~LOGFS_SB_FLAG_DIRTY;
745
746 BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
747 mutex_lock(&super->s_journal_mutex);
748
749 /* Do this first or suffer corruption */
750 logfs_sync_segments(sb);
751 account_shadows(sb);
752
753again:
 /* Restart point: forget the offsets recorded by a failed attempt */
754 super->s_no_je = 0;
 /* One area entry per open area; s_sum_index tells the callback which */
755 for_each_area(i) {
756 if (!super->s_area[i]->a_is_open)
757 continue;
758 super->s_sum_index = i;
759 err = logfs_write_je(sb, logfs_write_area);
760 if (err)
761 goto again;
762 }
763 err = logfs_write_obj_aliases(sb);
764 if (err)
765 goto again;
766 err = logfs_write_je(sb, logfs_write_erasecount);
767 if (err)
768 goto again;
769 err = logfs_write_je(sb, __logfs_write_anchor);
770 if (err)
771 goto again;
772 err = logfs_write_je(sb, logfs_write_dynsb);
773 if (err)
774 goto again;
775 /*
776 * Order is imperative. First we sync all writes, including the
777 * non-committed journal writes. Then we write the final commit and
778 * sync the current journal segment.
779 * There is a theoretical bug here. Syncing the journal segment will
780 * write a number of journal entries and the final commit. All these
781 * are written in a single operation. If the device layer writes the
782 * data back-to-front, the commit will precede the other journal
783 * entries, leaving a race window.
784 * Two fixes are possible. Preferred is to fix the device layer to
785 * ensure writes happen front-to-back. Alternatively we can insert
786 * another logfs_sync_area() super->s_devops->sync() combo before
787 * writing the commit.
788 */
789 /*
790 * On another subject, super->s_devops->sync is usually not necessary.
791 * Unless called from sys_sync or friends, a barrier would suffice.
792 */
793 super->s_devops->sync(sb);
794 err = logfs_write_je(sb, logfs_write_commit);
795 if (err)
796 goto again;
 /* The commit is the last entry recorded in s_je_array */
797 log_journal("Write commit to %llx\n",
798 be64_to_cpu(super->s_je_array[super->s_no_je - 1]));
799 logfs_sync_area(area);
 /* After the sync nothing in the journal area may remain unwritten */
800 BUG_ON(area->a_used_bytes != area->a_written_bytes);
801 super->s_devops->sync(sb);
802
803 mutex_unlock(&super->s_journal_mutex);
804 return;
805}
806
807void do_logfs_journal_wl_pass(struct super_block *sb)
808{
809 struct logfs_super *super = logfs_super(sb);
810 struct logfs_area *area = super->s_journal_area;
811 struct btree_head32 *head = &super->s_reserved_segments;
812 u32 segno, ec;
813 int i, err;
814
815 log_journal("Journal requires wear-leveling.\n");
816 /* Drop old segments */
817 journal_for_each(i)
818 if (super->s_journal_seg[i]) {
819 btree_remove32(head, super->s_journal_seg[i]);
820 logfs_set_segment_unreserved(sb,
821 super->s_journal_seg[i],
822 super->s_journal_ec[i]);
823 super->s_journal_seg[i] = 0;
824 super->s_journal_ec[i] = 0;
825 }
826 /* Get new segments */
827 for (i = 0; i < super->s_no_journal_segs; i++) {
828 segno = get_best_cand(sb, &super->s_reserve_list, &ec);
829 super->s_journal_seg[i] = segno;
830 super->s_journal_ec[i] = ec;
831 logfs_set_segment_reserved(sb, segno);
832 err = btree_insert32(head, segno, (void *)1, GFP_KERNEL);
833 BUG_ON(err); /* mempool should prevent this */
834 err = logfs_erase_segment(sb, segno, 1);
835 BUG_ON(err); /* FIXME: remount-ro would be nicer */
836 }
837 /* Manually move journal_area */
838 freeseg(sb, area->a_segno);
839 area->a_segno = super->s_journal_seg[0];
840 area->a_is_open = 0;
841 area->a_used_bytes = 0;
842 /* Write journal */
843 logfs_write_anchor(sb);
844 /* Write superblocks */
845 err = logfs_write_sb(sb);
846 BUG_ON(err);
847}
848
/*
 * Area operations used for the journal area; installed on
 * super->s_journal_area by logfs_init_journal().
 */
849static const struct logfs_area_ops journal_area_ops = {
850 .get_free_segment = journal_get_free_segment,
851 .get_erase_count = journal_get_erase_count,
852 .erase_segment = journal_erase_segment,
853};
854
/*
 * Set up the journal part of the in-memory superblock: the journal mutex,
 * the reserved-segment btree, the two journal entry buffers and the master
 * inode; then read the journal from the medium.
 *
 * Returns 0 on success, -ENOMEM if a buffer cannot be allocated, the error
 * encoded in the pointer returned by logfs_new_meta_inode(), or -EIO if
 * logfs_read_journal() fails (its own error code is not passed on).
 *
 * NOTE(review): the error paths return without releasing what was already
 * allocated here; presumably the caller runs logfs_cleanup_journal() on
 * failure - confirm.
 */
855int logfs_init_journal(struct super_block *sb)
856{
857 struct logfs_super *super = logfs_super(sb);
 /* Both buffers hold one block or one write unit, whichever is larger,
  * plus MAX_JOURNAL_HEADER extra bytes */
858 size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
859 + MAX_JOURNAL_HEADER;
860 int ret = -ENOMEM;
861
862 mutex_init(&super->s_journal_mutex);
863 btree_init_mempool32(&super->s_reserved_segments, super->s_btree_pool);
864
865 super->s_je = kzalloc(bufsize, GFP_KERNEL);
866 if (!super->s_je)
867 return ret;
868
869 super->s_compressed_je = kzalloc(bufsize, GFP_KERNEL);
870 if (!super->s_compressed_je)
871 return ret;
872
873 super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER);
874 if (IS_ERR(super->s_master_inode))
875 return PTR_ERR(super->s_master_inode);
876
877 ret = logfs_read_journal(sb);
878 if (ret)
879 return -EIO;
880
881 reserve_sb_and_journal(sb);
882 logfs_calc_free(sb);
883
884 super->s_journal_area->a_ops = &journal_area_ops;
885 return 0;
886}
887
888void logfs_cleanup_journal(struct super_block *sb)
889{
890 struct logfs_super *super = logfs_super(sb);
891
892 btree_grim_visitor32(&super->s_reserved_segments, 0, NULL);
893 destroy_meta_inode(super->s_master_inode);
894 super->s_master_inode = NULL;
895
896 kfree(super->s_compressed_je);
897 kfree(super->s_je);
898}
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
new file mode 100644
index 000000000000..0a3df1a0c936
--- /dev/null
+++ b/fs/logfs/logfs.h
@@ -0,0 +1,736 @@
1/*
2 * fs/logfs/logfs.h
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Private header for logfs.
9 */
10#ifndef FS_LOGFS_LOGFS_H
11#define FS_LOGFS_LOGFS_H
12
13#undef __CHECK_ENDIAN__
14#define __CHECK_ENDIAN__
15
16#include <linux/btree.h>
17#include <linux/crc32.h>
18#include <linux/fs.h>
19#include <linux/kernel.h>
20#include <linux/mempool.h>
21#include <linux/pagemap.h>
22#include <linux/mtd/mtd.h>
23#include "logfs_abi.h"
24
25#define LOGFS_DEBUG_SUPER (0x0001)
26#define LOGFS_DEBUG_SEGMENT (0x0002)
27#define LOGFS_DEBUG_JOURNAL (0x0004)
28#define LOGFS_DEBUG_DIR (0x0008)
29#define LOGFS_DEBUG_FILE (0x0010)
30#define LOGFS_DEBUG_INODE (0x0020)
31#define LOGFS_DEBUG_READWRITE (0x0040)
32#define LOGFS_DEBUG_GC (0x0080)
33#define LOGFS_DEBUG_GC_NOISY (0x0100)
34#define LOGFS_DEBUG_ALIASES (0x0200)
35#define LOGFS_DEBUG_BLOCKMOVE (0x0400)
36#define LOGFS_DEBUG_ALL (0xffffffff)
37
/*
 * Mask of enabled debug message classes (LOGFS_DEBUG_* above).
 * 0x01 is the value of LOGFS_DEBUG_SUPER.
 */
38#define LOGFS_DEBUG (0x01)
39/*
40 * To enable specific log messages, simply define LOGFS_DEBUG to match any
41 * or all of the above.
 *
 * NOTE(review): LOGFS_DEBUG is defined unconditionally just above, so the
 * #ifndef fallback below never takes effect in this file as written.
42 */
43#ifndef LOGFS_DEBUG
44#define LOGFS_DEBUG (0)
45#endif
46
/*
 * log_cond - printk at KERN_DEBUG level if @cond is true.
 *
 * The log_*() wrappers below pass "LOGFS_DEBUG & LOGFS_DEBUG_<class>" as
 * the condition, so each message class is switched by its bit in
 * LOGFS_DEBUG.
 */
47#define log_cond(cond, fmt, arg...) do { \
48 if (cond) \
49 printk(KERN_DEBUG fmt, ##arg); \
50} while (0)
51
52#define log_super(fmt, arg...) \
53 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SUPER, fmt, ##arg)
54#define log_segment(fmt, arg...) \
55 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SEGMENT, fmt, ##arg)
56#define log_journal(fmt, arg...) \
57 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_JOURNAL, fmt, ##arg)
58#define log_dir(fmt, arg...) \
59 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_DIR, fmt, ##arg)
60#define log_file(fmt, arg...) \
61 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_FILE, fmt, ##arg)
62#define log_inode(fmt, arg...) \
63 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_INODE, fmt, ##arg)
64#define log_readwrite(fmt, arg...) \
65 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_READWRITE, fmt, ##arg)
66#define log_gc(fmt, arg...) \
67 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC, fmt, ##arg)
68#define log_gc_noisy(fmt, arg...) \
69 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC_NOISY, fmt, ##arg)
70#define log_aliases(fmt, arg...) \
71 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_ALIASES, fmt, ##arg)
72#define log_blockmove(fmt, arg...) \
73 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_BLOCKMOVE, fmt, ##arg)
/*
 * logfs-private page flag, stored in the PG_owner_priv_1 bit of
 * page->flags. The three helpers test, set and clear that bit.
 */
75#define PG_pre_locked PG_owner_priv_1
76#define PagePreLocked(page) test_bit(PG_pre_locked, &(page)->flags)
77#define SetPagePreLocked(page) set_bit(PG_pre_locked, &(page)->flags)
78#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags)
79
80/* FIXME: This should really be somewhere in the 64bit area. */
81#define LOGFS_LINK_MAX (1<<30)
82
83/* Read-only filesystem */
84#define LOGFS_SB_FLAG_RO 0x0001
85#define LOGFS_SB_FLAG_DIRTY 0x0002
86#define LOGFS_SB_FLAG_OBJ_ALIAS 0x0004
87#define LOGFS_SB_FLAG_SHUTDOWN 0x0008
88
89/* Write Control Flags */
90#define WF_LOCK 0x01 /* take write lock */
91#define WF_WRITE 0x02 /* write block */
92#define WF_DELETE 0x04 /* delete old block */
93
/*
 * Two distinct __bitwise u8 types, so that plain block levels (level_t)
 * and GC levels (gc_level_t) cannot be mixed without an explicit __force
 * cast. See shrink_level()/expand_level() for the conversions.
 */
94typedef u8 __bitwise level_t;
95typedef u8 __bitwise gc_level_t;
96
/* Build a level_t / gc_level_t from a plain integer */
97#define LEVEL(level) ((__force level_t)(level))
98#define GC_LEVEL(gc_level) ((__force gc_level_t)(gc_level))
99
/*
 * SUBLEVEL - the level one below @level.
 * The discarded comparison against LEVEL(1) has no runtime effect;
 * presumably it exists so that type checking rejects arguments that are
 * not level_t - TODO confirm.
 */
100#define SUBLEVEL(level) ( (void)((level) == LEVEL(1)), \
101 (__force level_t)((__force u8)(level) - 1) )
102
103/**
104 * struct logfs_area - area management information
105 *
106 * @a_sb: the superblock this area belongs to
107 * @a_is_open: 1 if the area is currently open, else 0
108 * @a_segno: segment number of area
109 * @a_written_bytes: number of bytes already written back
110 * @a_used_bytes: number of used bytes
111 * @a_ops: area operations (either journal or ostore)
112 * @a_erase_count: erase count
113 * @a_level: GC level
114 */
115struct logfs_area { /* a segment open for writing */
116 struct super_block *a_sb;
117 int a_is_open;
118 u32 a_segno;
119 u32 a_written_bytes;
120 u32 a_used_bytes;
121 const struct logfs_area_ops *a_ops;
122 u32 a_erase_count;
123 gc_level_t a_level;
124};
125
126/**
127 * struct logfs_area_ops - area operations
128 *
129 * @get_free_segment: fill area->ofs with the offset of a free segment
130 * @get_erase_count: fill area->erase_count (needs area->ofs)
131 * @erase_segment: erase and setup segment
 *
 * NOTE(review): struct logfs_area has no "ofs" or "erase_count" member;
 * the descriptions above presumably refer to a_segno and a_erase_count -
 * confirm against the implementations.
132 */
133struct logfs_area_ops {
134 void (*get_free_segment)(struct logfs_area *area);
135 void (*get_erase_count)(struct logfs_area *area);
136 int (*erase_segment)(struct logfs_area *area);
137};
138
139/**
140 * struct logfs_device_ops - device access operations
141 *
 * @find_first_sb: NOTE(review): undocumented; by name, locates the first
 *	superblock copy and reports its offset through @ofs - confirm
 * @find_last_sb: NOTE(review): undocumented; by name, the counterpart for
 *	the last superblock copy - confirm
 * @write_sb: NOTE(review): undocumented; by name, writes one superblock
 *	page - confirm
142 * @readpage: read one page (mm page)
143 * @writeseg: write one segment. may be a partial segment
144 * @erase: erase one segment (described elsewhere as "erase part of the
 *	device"); takes an offset, a length and an ensure_write flag
145 * @sync: called by the journal code to sync device writes
146 * @put_device: NOTE(review): undocumented; by name, releases the
 *	underlying device - confirm
 *
 * NOTE(review): an earlier version of this comment also listed "@read:
 * read from the device"; the structure has no such member.
147 */
148struct logfs_device_ops {
149 struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
150 struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs);
151 int (*write_sb)(struct super_block *sb, struct page *page);
152 int (*readpage)(void *_sb, struct page *page);
153 void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
154 int (*erase)(struct super_block *sb, loff_t ofs, size_t len,
155 int ensure_write);
156 void (*sync)(struct super_block *sb);
157 void (*put_device)(struct super_block *sb);
158};
159
160/**
161 * struct candidate_list - list of similar candidates
162 */
163struct candidate_list {
164 struct rb_root rb_tree;
165 int count;
166 int maxcount;
167 int sort_by_ec;
168};
169
170/**
171 * struct gc_candidate - "candidate" segment to be garbage collected next
172 *
 * @rb_node: node in the rb_tree of the candidate_list pointed to by @list
 *	(presumably - TODO confirm)
173 * @list: list (either free or low)
174 * @segno: segment number
175 * @valid: number of valid bytes
176 * @erase_count: erase count of segment
177 * @dist: distance from tree root
178 *
179 * Candidates can be on two lists. The free list contains electees rather
180 * than candidates - segments that no longer contain any valid data. The
181 * low list contains candidates to be picked for GC. It should be kept
182 * short. It is not required to always pick a perfect candidate. In the
183 * worst case GC will have to move more data than absolutely necessary.
184 */
185struct gc_candidate {
186 struct rb_node rb_node;
187 struct candidate_list *list;
188 u32 segno;
189 u32 valid;
190 u32 erase_count;
191 u8 dist;
192};
193
194/**
195 * struct logfs_journal_entry - temporary structure used during journal scan
196 *
197 * @used: NOTE(review): undocumented; presumably non-zero once the entry
 *	has been filled in - confirm
198 * @version: normalized version
199 * @len: length
 * @datalen: NOTE(review): undocumented; presumably the length of the
 *	entry's data as opposed to @len - confirm
200 * @offset: offset
201 */
202struct logfs_journal_entry {
203 int used;
204 s16 version;
205 u16 len;
206 u16 datalen;
207 u64 offset;
208};
209
210enum transaction_state {
211 CREATE_1 = 1,
212 CREATE_2,
213 UNLINK_1,
214 UNLINK_2,
215 CROSS_RENAME_1,
216 CROSS_RENAME_2,
217 TARGET_RENAME_1,
218 TARGET_RENAME_2,
219 TARGET_RENAME_3
220};
221
222/**
223 * struct logfs_transaction - essential fields to support atomic dirops
224 *
225 * @ino: target inode
226 * @dir: inode of directory containing dentry
227 * @pos: pos of dentry in directory
228 */
229struct logfs_transaction {
230 enum transaction_state state;
231 u64 ino;
232 u64 dir;
233 u64 pos;
234};
235
236/**
237 * struct logfs_shadow - old block in the shadow of a not-yet-committed new one
238 * @old_ofs: offset of old block on medium
239 * @new_ofs: offset of new block on medium
240 * @ino: inode number
241 * @bix: block index
242 * @old_len: size of old block, including header
243 * @new_len: size of new block, including header
244 * @gc_level: block level, stored as a gc_level_t
245 */
246struct logfs_shadow {
247 u64 old_ofs;
248 u64 new_ofs;
249 u64 ino;
250 u64 bix;
251 int old_len;
252 int new_len;
253 gc_level_t gc_level;
254};
255
256/**
257 * struct shadow_tree
258 * @new: shadows where old_ofs==0, indexed by new_ofs
259 * @old: shadows where old_ofs!=0, indexed by old_ofs
260 * @segment_map: bitfield of segments containing shadows
261 * @no_shadowed_segments: number of segments containing shadows
262 */
263struct shadow_tree {
264 struct btree_head64 new;
265 struct btree_head64 old;
266 struct btree_head32 segment_map;
267 int no_shadowed_segments;
268};
269
270struct object_alias_item {
271 struct list_head list;
272 __be64 val;
273 int child_no;
274};
275
276/**
277 * struct logfs_block - contains any block state
278 * @type: indirect block or inode
279 * @full: number of fully populated children
280 * @partial: number of partially populated children
281 *
282 * Most blocks are directly represented by page cache pages. But when a block
283 * becomes dirty, is part of a transaction, contains aliases or is otherwise
284 * special, a struct logfs_block is allocated to track the additional state.
285 * Inodes are very similar to indirect blocks, so they can also get one of
286 * these structures added when appropriate.
 *
 * NOTE(review): the structure has no "type" member although @type is
 * documented above and BLOCK_INDIRECT/BLOCK_INODE are defined below;
 * presumably the distinction is carried by @ops or by which of @page and
 * @inode is set - confirm. The remaining members (alias_list, item_list,
 * sb, ino, bix, level, page, inode, ta, alias_map, ops, reserved_bytes)
 * are not documented here.
287 */
288#define BLOCK_INDIRECT 1 /* Indirect block */
289#define BLOCK_INODE 2 /* Inode */
290struct logfs_block_ops;
291struct logfs_block {
292 struct list_head alias_list;
293 struct list_head item_list;
294 struct super_block *sb;
295 u64 ino;
296 u64 bix;
297 level_t level;
298 struct page *page;
299 struct inode *inode;
300 struct logfs_transaction *ta;
301 unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG];
302 struct logfs_block_ops *ops;
303 int full;
304 int partial;
305 int reserved_bytes;
306};
307
308typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix,
309 level_t level, int child_no, __be64 val);
310struct logfs_block_ops {
311 void (*write_block)(struct logfs_block *block);
312 void (*free_block)(struct super_block *sb, struct logfs_block*block);
313 int (*write_alias)(struct super_block *sb,
314 struct logfs_block *block,
315 write_alias_t *write_one_alias);
316};
317
318#define MAX_JOURNAL_ENTRIES 256
319
/*
 * struct logfs_super - in-memory, per-mount state of a logfs filesystem.
 * Reached from the VFS superblock through sb->s_fs_info (see logfs_super()).
 * Fields are grouped by the source file that mainly uses them.
 */
320struct logfs_super {
321 struct mtd_info *s_mtd; /* underlying device */
322 struct block_device *s_bdev; /* underlying device */
323 const struct logfs_device_ops *s_devops;/* device access */
324 struct inode *s_master_inode; /* inode file */
325 struct inode *s_segfile_inode; /* segment file */
326 struct inode *s_mapping_inode; /* device mapping */
327 atomic_t s_pending_writes; /* outstanding bios */
328 long s_flags;
329 mempool_t *s_btree_pool; /* for btree nodes */
330 mempool_t *s_alias_pool; /* aliases in segment.c */
331 u64 s_feature_incompat;
332 u64 s_feature_ro_compat;
333 u64 s_feature_compat;
334 u64 s_feature_flags;
335 u64 s_sb_ofs[2];
336 struct page *s_erase_page; /* for dev_bdev.c */
337 /* alias.c fields */
338 struct btree_head32 s_segment_alias; /* remapped segments */
339 int s_no_object_aliases;
340 struct list_head s_object_alias; /* remapped objects */
341 struct btree_head128 s_object_alias_tree; /* remapped objects */
342 struct mutex s_object_alias_mutex;
343 /* dir.c fields */
344 struct mutex s_dirop_mutex; /* for creat/unlink/rename */
345 u64 s_victim_ino; /* used for atomic dir-ops */
346 u64 s_rename_dir; /* source directory ino */
347 u64 s_rename_pos; /* position of source dd */
348 /* gc.c fields */
349 long s_segsize; /* size of a segment */
350 int s_segshift; /* log2 of segment size */
351 long s_segmask; /* (1 << s_segshift) - 1 */
352 long s_no_segs; /* segments on device */
353 long s_no_journal_segs; /* segments used for journal */
354 long s_no_blocks; /* blocks per segment */
355 long s_writesize; /* minimum write size */
356 int s_writeshift; /* log2 of write size */
357 u64 s_size; /* filesystem size */
358 struct logfs_area *s_area[LOGFS_NO_AREAS]; /* open segment array */
359 u64 s_gec; /* global erase count */
360 u64 s_wl_gec_ostore; /* time of last wl event */
361 u64 s_wl_gec_journal; /* time of last wl event */
362 u64 s_sweeper; /* current sweeper pos */
363 u8 s_ifile_levels; /* max level of ifile */
364 u8 s_iblock_levels; /* max level of regular files */
365 u8 s_data_levels; /* # of segments to leaf block*/
366 u8 s_total_levels; /* sum of above three */
367 struct btree_head32 s_cand_tree; /* all candidates */
368 struct candidate_list s_free_list; /* 100% free segments */
369 struct candidate_list s_reserve_list; /* Bad segment reserve */
370 struct candidate_list s_low_list[LOGFS_NO_AREAS];/* good candidates */
371 struct candidate_list s_ec_list; /* wear level candidates */
372 struct btree_head32 s_reserved_segments;/* sb, journal, bad, etc. */
373 /* inode.c fields */
374 u64 s_last_ino; /* highest ino used */
375 long s_inos_till_wrap;
376 u32 s_generation; /* i_generation for new files */
377 struct list_head s_freeing_list; /* inodes being freed */
378 /* journal.c fields */
379 struct mutex s_journal_mutex;
380 void *s_je; /* journal entry to compress */
381 void *s_compressed_je; /* block to write to journal */
382 u32 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */
383 u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */
384 u64 s_last_version;
385 struct logfs_area *s_journal_area; /* open journal segment */
386 __be64 s_je_array[MAX_JOURNAL_ENTRIES];
387 int s_no_je;
388
389 int s_sum_index; /* for the 12 summaries */
390 struct shadow_tree s_shadow_tree;
391 int s_je_fill; /* index of current je */
392 /* readwrite.c fields */
393 struct mutex s_write_mutex;
394 int s_lock_count;
395 mempool_t *s_block_pool; /* struct logfs_block pool */
396 mempool_t *s_shadow_pool; /* struct logfs_shadow pool */
397 /*
398 * Space accounting:
399 * - s_used_bytes specifies space used to store valid data objects.
400 * - s_dirty_used_bytes is space used to store non-committed data
401 * objects. Those objects have already been written themselves,
402 * but they don't become valid until all indirect blocks up to the
403 * journal have been written as well.
404 * - s_dirty_free_bytes is space used to store the old copy of a
405 * replaced object, as long as the replacement is non-committed.
406 * In other words, it is the amount of space freed when all dirty
407 * blocks are written back.
408 * - s_free_bytes is the amount of free space available for any
409 * purpose.
410 * - s_root_reserve is the amount of free space available only to
411 * the root user. Non-privileged users can no longer write once
412 * this watermark has been reached.
413 * - s_speed_reserve is space which remains unused to speed up
414 * garbage collection performance.
415 * - s_dirty_pages is the space reserved for currently dirty pages.
416 * It is a pessimistic estimate, so some/most will get freed on
417 * page writeback.
418 *
419 * s_used_bytes + s_free_bytes + s_speed_reserve = total usable size
420 */
421 u64 s_free_bytes;
422 u64 s_used_bytes;
423 u64 s_dirty_free_bytes;
424 u64 s_dirty_used_bytes;
425 u64 s_root_reserve;
426 u64 s_speed_reserve;
427 u64 s_dirty_pages;
428 /* Bad block handling:
429 * - s_bad_seg_reserve is a number of segments usually kept
430 * free. When encountering bad blocks, the affected segment's data
431 * is _temporarily_ moved to a reserved segment.
432 * - s_bad_segments is the number of known bad segments.
433 */
434 u32 s_bad_seg_reserve;
435 u32 s_bad_segments;
436};
437
438/**
439 * struct logfs_inode - in-memory inode
440 *
441 * @vfs_inode: struct inode
442 * @li_data: data pointers
443 * @li_used_bytes: number of used bytes
444 * @li_freeing_list: used to track inodes currently being freed
 * @li_block: NOTE(review): undocumented; presumably the struct logfs_block
 *	attached to this inode when it carries extra state - confirm
445 * @li_flags: inode flags
 * @li_height: NOTE(review): undocumented; presumably the height of the
 *	inode's indirect block tree - confirm
446 * @li_refcount: number of internal (GC-induced) references
 *
 * The VFS inode is embedded, so logfs_inode() converts a struct inode
 * pointer back to its container with container_of().
447 */
448struct logfs_inode {
449 struct inode vfs_inode;
450 u64 li_data[LOGFS_EMBEDDED_FIELDS];
451 u64 li_used_bytes;
452 struct list_head li_freeing_list;
453 struct logfs_block *li_block;
454 u32 li_flags;
455 u8 li_height;
456 int li_refcount;
457};
458
459#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++)
460#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++)
461#define for_each_area_down(__i) for (__i = LOGFS_NO_AREAS - 1; __i >= 0; __i--)
462
463/* compr.c */
464int logfs_compress(void *in, void *out, size_t inlen, size_t outlen);
465int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen);
466int __init logfs_compr_init(void);
467void logfs_compr_exit(void);
468
469/* dev_bdev.c */
470#ifdef CONFIG_BLOCK
471int logfs_get_sb_bdev(struct file_system_type *type, int flags,
472 const char *devname, struct vfsmount *mnt);
473#else
474static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
475 const char *devname, struct vfsmount *mnt)
476{
477 return -ENODEV;
478}
479#endif
480
481/* dev_mtd.c */
482#ifdef CONFIG_MTD
483int logfs_get_sb_mtd(struct file_system_type *type, int flags,
484 int mtdnr, struct vfsmount *mnt);
485#else
486static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags,
487 int mtdnr, struct vfsmount *mnt)
488{
489 return -ENODEV;
490}
491#endif
492
493/* dir.c */
494extern const struct inode_operations logfs_symlink_iops;
495extern const struct inode_operations logfs_dir_iops;
496extern const struct file_operations logfs_dir_fops;
497int logfs_replay_journal(struct super_block *sb);
498
499/* file.c */
500extern const struct inode_operations logfs_reg_iops;
501extern const struct file_operations logfs_reg_fops;
502extern const struct address_space_operations logfs_reg_aops;
503int logfs_readpage(struct file *file, struct page *page);
504int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
505 unsigned long arg);
506int logfs_fsync(struct file *file, struct dentry *dentry, int datasync);
507
508/* gc.c */
509u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
510void logfs_gc_pass(struct super_block *sb);
511int logfs_check_areas(struct super_block *sb);
512int logfs_init_gc(struct super_block *sb);
513void logfs_cleanup_gc(struct super_block *sb);
514
515/* inode.c */
516extern const struct super_operations logfs_super_operations;
517struct inode *logfs_iget(struct super_block *sb, ino_t ino);
518struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie);
519void logfs_safe_iput(struct inode *inode, int cookie);
520struct inode *logfs_new_inode(struct inode *dir, int mode);
521struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino);
522struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino);
523int logfs_init_inode_cache(void);
524void logfs_destroy_inode_cache(void);
525void destroy_meta_inode(struct inode *inode);
526void logfs_set_blocks(struct inode *inode, u64 no);
527/* these logically belong into inode.c but actually reside in readwrite.c */
528int logfs_read_inode(struct inode *inode);
529int __logfs_write_inode(struct inode *inode, long flags);
530void logfs_delete_inode(struct inode *inode);
531void logfs_clear_inode(struct inode *inode);
532
533/* journal.c */
534void logfs_write_anchor(struct super_block *sb);
535int logfs_init_journal(struct super_block *sb);
536void logfs_cleanup_journal(struct super_block *sb);
537int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
538 level_t level, int child_no, __be64 val);
539void do_logfs_journal_wl_pass(struct super_block *sb);
540
541/* readwrite.c */
542pgoff_t logfs_pack_index(u64 bix, level_t level);
543void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level);
544int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
545 loff_t bix, long flags, struct shadow_tree *shadow_tree);
546int logfs_readpage_nolock(struct page *page);
547int logfs_write_buf(struct inode *inode, struct page *page, long flags);
548int logfs_delete(struct inode *inode, pgoff_t index,
549 struct shadow_tree *shadow_tree);
550int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
551 gc_level_t gc_level, long flags);
552int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
553 gc_level_t gc_level);
554int logfs_truncate(struct inode *inode, u64 size);
555u64 logfs_seek_hole(struct inode *inode, u64 bix);
556u64 logfs_seek_data(struct inode *inode, u64 bix);
557int logfs_open_segfile(struct super_block *sb);
558int logfs_init_rw(struct super_block *sb);
559void logfs_cleanup_rw(struct super_block *sb);
560void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta);
561void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta);
562void logfs_write_block(struct logfs_block *block, long flags);
563int logfs_write_obj_aliases_pagecache(struct super_block *sb);
564void logfs_get_segment_entry(struct super_block *sb, u32 segno,
565 struct logfs_segment_entry *se);
566void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment);
567void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
568 gc_level_t gc_level);
569void logfs_set_segment_reserved(struct super_block *sb, u32 segno);
570void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec);
571struct logfs_block *__alloc_block(struct super_block *sb,
572 u64 ino, u64 bix, level_t level);
573void __free_block(struct super_block *sb, struct logfs_block *block);
574void btree_write_block(struct logfs_block *block);
575void initialize_block_counters(struct page *page, struct logfs_block *block,
576 __be64 *array, int page_is_empty);
577int logfs_exist_block(struct inode *inode, u64 bix);
578int get_page_reserve(struct inode *inode, struct page *page);
579extern struct logfs_block_ops indirect_block_ops;
580
581/* segment.c */
582int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase);
583int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf);
584int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix,
585 level_t level);
586int logfs_segment_write(struct inode *inode, struct page *page,
587 struct logfs_shadow *shadow);
588int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow);
589int logfs_load_object_aliases(struct super_block *sb,
590 struct logfs_obj_alias *oa, int count);
591void move_page_to_btree(struct page *page);
592int logfs_init_mapping(struct super_block *sb);
593void logfs_sync_area(struct logfs_area *area);
594void logfs_sync_segments(struct super_block *sb);
595void freeseg(struct super_block *sb, u32 segno);
596
597/* area handling */
598int logfs_init_areas(struct super_block *sb);
599void logfs_cleanup_areas(struct super_block *sb);
600int logfs_open_area(struct logfs_area *area, size_t bytes);
601void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
602 int use_filler);
603
604static inline void logfs_buf_write(struct logfs_area *area, u64 ofs,
605 void *buf, size_t len)
606{
607 __logfs_buf_write(area, ofs, buf, len, 0);
608}
609
610static inline void logfs_buf_recover(struct logfs_area *area, u64 ofs,
611 void *buf, size_t len)
612{
613 __logfs_buf_write(area, ofs, buf, len, 1);
614}
615
616/* super.c */
617struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index);
618void emergency_read_end(struct page *page);
619void logfs_crash_dump(struct super_block *sb);
620void *memchr_inv(const void *s, int c, size_t n);
621int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
622int logfs_get_sb_device(struct file_system_type *type, int flags,
623 struct mtd_info *mtd, struct block_device *bdev,
624 const struct logfs_device_ops *devops, struct vfsmount *mnt);
625int logfs_check_ds(struct logfs_disk_super *ds);
626int logfs_write_sb(struct super_block *sb);
627
628static inline struct logfs_super *logfs_super(struct super_block *sb)
629{
630 return sb->s_fs_info;
631}
632
633static inline struct logfs_inode *logfs_inode(struct inode *inode)
634{
635 return container_of(inode, struct logfs_inode, vfs_inode);
636}
637
638static inline void logfs_set_ro(struct super_block *sb)
639{
640 logfs_super(sb)->s_flags |= LOGFS_SB_FLAG_RO;
641}
642
/*
 * LOGFS_BUG - fatal filesystem error.
 * Calls logfs_crash_dump(), sets LOGFS_SB_FLAG_RO in the superblock flags
 * and then hits BUG(). @sb is evaluated once, into a local copy.
 */
643#define LOGFS_BUG(sb) do { \
644 struct super_block *__sb = sb; \
645 logfs_crash_dump(__sb); \
646 logfs_super(__sb)->s_flags |= LOGFS_SB_FLAG_RO; \
647 BUG(); \
648} while (0)
649
/* LOGFS_BUG_ON - LOGFS_BUG(@sb) if @condition is true (marked unlikely). */
650#define LOGFS_BUG_ON(condition, sb) \
651 do { if (unlikely(condition)) LOGFS_BUG((sb)); } while (0)
652
653static inline __be32 logfs_crc32(void *data, size_t len, size_t skip)
654{
655 return cpu_to_be32(crc32(~0, data+skip, len-skip));
656}
657
658static inline u8 logfs_type(struct inode *inode)
659{
660 return (inode->i_mode >> 12) & 15;
661}
662
663static inline pgoff_t logfs_index(struct super_block *sb, u64 pos)
664{
665 return pos >> sb->s_blocksize_bits;
666}
667
668static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs)
669{
670 return ((u64)segno << logfs_super(sb)->s_segshift) + ofs;
671}
672
673static inline u32 seg_no(struct super_block *sb, u64 ofs)
674{
675 return ofs >> logfs_super(sb)->s_segshift;
676}
677
678static inline u32 seg_ofs(struct super_block *sb, u64 ofs)
679{
680 return ofs & logfs_super(sb)->s_segmask;
681}
682
683static inline u64 seg_align(struct super_block *sb, u64 ofs)
684{
685 return ofs & ~logfs_super(sb)->s_segmask;
686}
687
688static inline struct logfs_block *logfs_block(struct page *page)
689{
690 return (void *)page->private;
691}
692
693static inline level_t shrink_level(gc_level_t __level)
694{
695 u8 level = (__force u8)__level;
696
697 if (level >= LOGFS_MAX_LEVELS)
698 level -= LOGFS_MAX_LEVELS;
699 return (__force level_t)level;
700}
701
702static inline gc_level_t expand_level(u64 ino, level_t __level)
703{
704 u8 level = (__force u8)__level;
705
706 if (ino == LOGFS_INO_MASTER) {
707 /* ifile has seperate areas */
708 level += LOGFS_MAX_LEVELS;
709 }
710 return (__force gc_level_t)level;
711}
712
713static inline int logfs_block_shift(struct super_block *sb, level_t level)
714{
715 level = shrink_level((__force gc_level_t)level);
716 return (__force int)level * (sb->s_blocksize_bits - 3);
717}
718
719static inline u64 logfs_block_mask(struct super_block *sb, level_t level)
720{
721 return ~0ull << logfs_block_shift(sb, level);
722}
723
724static inline struct logfs_area *get_area(struct super_block *sb,
725 gc_level_t gc_level)
726{
727 return logfs_super(sb)->s_area[(__force u8)gc_level];
728}
729
730static inline void logfs_mempool_destroy(mempool_t *pool)
731{
732 if (pool)
733 mempool_destroy(pool);
734}
735
736#endif
diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h
new file mode 100644
index 000000000000..f674725663fe
--- /dev/null
+++ b/fs/logfs/logfs_abi.h
@@ -0,0 +1,629 @@
1/*
2 * fs/logfs/logfs_abi.h
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Public header for logfs.
9 */
10#ifndef FS_LOGFS_LOGFS_ABI_H
11#define FS_LOGFS_LOGFS_ABI_H
12
/* For out-of-kernel compiles: BUILD_BUG_ON expands to nothing if undefined */
#ifndef BUILD_BUG_ON
#define BUILD_BUG_ON(condition) /**/
#endif

/*
 * SIZE_CHECK(type, size) emits a never-called inline function
 * check_<type>() whose body asserts, via BUILD_BUG_ON, that
 * sizeof(struct type) equals @size.
 */
#define SIZE_CHECK(type, size) \
static inline void check_##type(void) \
{ \
	BUILD_BUG_ON(sizeof(struct type) != (size)); \
}
23
24/*
25 * Throughout the logfs code, we're constantly dealing with blocks at
26 * various positions or offsets. To remove confusion, we strictly
27 * distinguish between a "position" - the logical position within a
28 * file and an "offset" - the physical location within the device.
29 *
30 * Any usage of the term offset for a logical location or position for
31 * a physical one is a bug and should get fixed.
32 */
33
34/*
35 * Blocks are allocated in one of several segments depending on their
36 * level. The following levels are used:
37 * 0 - regular data block
38 * 1 - i1 indirect blocks
39 * 2 - i2 indirect blocks
40 * 3 - i3 indirect blocks
41 * 4 - i4 indirect blocks
42 * 5 - i5 indirect blocks
43 * 6 - ifile data blocks
44 * 7 - ifile i1 indirect blocks
45 * 8 - ifile i2 indirect blocks
46 * 9 - ifile i3 indirect blocks
47 * 10 - ifile i4 indirect blocks
48 * 11 - ifile i5 indirect blocks
49 * Potential levels to be used in the future:
50 * 12 - gc recycled blocks, long-lived data
51 * 13 - replacement blocks, short-lived data
52 *
53 * Levels 1-11 are necessary for robust gc operations and help separate
54 * short-lived metadata from longer-lived file data. In the future,
55 * file data should get separated into several segments based on simple
56 * heuristics. Old data recycled during gc operation is expected to be
57 * long-lived. New data is of uncertain life expectancy. New data
58 * used to replace older blocks in existing files is expected to be
59 * short-lived.
60 */
61
62
/*
 * Magic numbers: LOGFS_MAGIC is the 64bit value for the superblock,
 * LOGFS_MAGIC_U32 the 32bit value for statfs f_type.
 */
#define LOGFS_MAGIC 0x7a3a8e5cb9d5bf67ull
#define LOGFS_MAGIC_U32 0xc97e8168u
66
/*
 * Various blocksize related macros. Blocksize is currently fixed at 4KiB.
 * Sooner or later that should become configurable and the macros replaced
 * by something superblock-dependent. Pointers in indirect blocks are and
 * will remain 64bit.
 *
 * LOGFS_BLOCKSIZE - block size in bytes
 * LOGFS_BLOCK_FACTOR - number of pointers per indirect block
 *			(4096 / 8 = 512)
 * LOGFS_BLOCK_BITS - log2 of LOGFS_BLOCK_FACTOR, used for shifts
 */
#define LOGFS_BLOCKSIZE (4096ull)
#define LOGFS_BLOCK_FACTOR (LOGFS_BLOCKSIZE / sizeof(u64))
#define LOGFS_BLOCK_BITS (9)
80
/*
 * Number of blocks at various levels of indirection. There are 16 direct
 * block pointers plus a single indirect pointer.  Each further level
 * multiplies the previous count by LOGFS_BLOCK_FACTOR.
 */
#define I0_BLOCKS (16)
#define I1_BLOCKS LOGFS_BLOCK_FACTOR
#define I2_BLOCKS (LOGFS_BLOCK_FACTOR * I1_BLOCKS)
#define I3_BLOCKS (LOGFS_BLOCK_FACTOR * I2_BLOCKS)
#define I4_BLOCKS (LOGFS_BLOCK_FACTOR * I3_BLOCKS)
#define I5_BLOCKS (LOGFS_BLOCK_FACTOR * I4_BLOCKS)

/* Slot of the indirect pointer: it follows the I0_BLOCKS direct pointers */
#define INDIRECT_INDEX I0_BLOCKS
/* Direct pointers plus the one indirect pointer */
#define LOGFS_EMBEDDED_FIELDS (I0_BLOCKS + 1)
94
/*
 * Sizes at which files require another level of indirection. Files smaller
 * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself,
 * similar to ext2 fast symlinks.
 *
 * Data at a position smaller than LOGFS_I0_SIZE is accessed through the
 * direct pointers, else through the 1x indirect pointer and so forth.
 */
#define LOGFS_EMBEDDED_SIZE (LOGFS_EMBEDDED_FIELDS * sizeof(u64))
#define LOGFS_I0_SIZE (I0_BLOCKS * LOGFS_BLOCKSIZE)
#define LOGFS_I1_SIZE (I1_BLOCKS * LOGFS_BLOCKSIZE)
#define LOGFS_I2_SIZE (I2_BLOCKS * LOGFS_BLOCKSIZE)
#define LOGFS_I3_SIZE (I3_BLOCKS * LOGFS_BLOCKSIZE)
#define LOGFS_I4_SIZE (I4_BLOCKS * LOGFS_BLOCKSIZE)
#define LOGFS_I5_SIZE (I5_BLOCKS * LOGFS_BLOCKSIZE)
110
/*
 * Each indirect block pointer must have this flag set, if all block pointers
 * behind it are set, i.e. there is no hole hidden in the shadow of this
 * indirect block pointer.
 *
 * pure_ofs() strips the flag again and yields the plain offset.  The
 * argument is parenthesized so that expressions containing operators of
 * lower precedence than '&' (e.g. pure_ofs(a | b)) are masked as a whole.
 */
#define LOGFS_FULLY_POPULATED (1ULL << 63)
#define pure_ofs(ofs) ((ofs) & ~LOGFS_FULLY_POPULATED)
118
/*
 * LogFS needs to separate data into levels. Each level is defined as the
 * maximal possible distance from the master inode (inode of the inode file).
 * Data blocks reside on level 0, 1x indirect block on level 1, etc.
 * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
 * This effort is necessary to guarantee garbage collection to always make
 * progress.
 *
 * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks,
 * LOGFS_MAX_LEVELS is one more for the actual data level of a file. It is
 * the maximal number of levels for one file.
 * LOGFS_NO_AREAS is twice that, as the inode file and regular files are
 * effectively stacked on top of each other.
 */
#define LOGFS_MAX_INDIRECT (5)
#define LOGFS_MAX_LEVELS (LOGFS_MAX_INDIRECT + 1)
#define LOGFS_NO_AREAS (2 * LOGFS_MAX_LEVELS)
136
/* Maximum size of filenames; also the size of logfs_disk_dentry.name[] */
#define LOGFS_MAX_NAMELEN (255)

/*
 * Number of segments in the primary journal; also the length of
 * logfs_disk_super.ds_journal_seg[].
 */
#define LOGFS_JOURNAL_SEGS (16)

/* Maximum number of free/erased/etc. segments in journal entries */
#define MAX_CACHED_SEGS (64)
145
146
/*
 * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store
 * (checked against struct logfs_object_header),
 * LOGFS_SEGMENT_HEADERSIZE is the size of the per-segment header
 * (checked against struct logfs_segment_header),
 * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including
 * its header,
 * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for
 * its segment header and the padded space at the end when no further objects
 * fit.
 */
#define LOGFS_OBJECT_HEADERSIZE (0x1c)
#define LOGFS_SEGMENT_HEADERSIZE (0x18)
#define LOGFS_MAX_OBJECTSIZE (LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE)
#define LOGFS_SEGMENT_RESERVE \
	(LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1)
160
/*
 * Segment types (stored in logfs_segment_header.type):
 * SEG_SUPER - segment holding a superblock
 * SEG_JOURNAL - segment holding journal entries
 * SEG_OSTORE - segment of the object store
 *
 * NOTE(review): descriptions derived from the constant names; the previous
 * text was a copy of the object type descriptions - confirm.
 */
enum {
	SEG_SUPER = 0x01,
	SEG_JOURNAL = 0x02,
	SEG_OSTORE = 0x03,
};
172
/**
 * struct logfs_segment_header - per-segment header in the ostore
 *
 * @crc: crc32 of header (there is no data)
 * @pad: unused, must be 0
 * @type: segment type, one of SEG_SUPER, SEG_JOURNAL, SEG_OSTORE
 * @level: GC level for all objects in this segment
 * @segno: segment number
 * @ec: erase count for this segment
 * @gec: global erase count at time of writing
 *
 * Multi-byte fields are big-endian.  Size is fixed at
 * LOGFS_SEGMENT_HEADERSIZE bytes.
 */
struct logfs_segment_header {
	__be32 crc;
	__be16 pad;
	__u8 type;
	__u8 level;
	__be32 segno;
	__be32 ec;
	__be64 gec;
};

SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
195
/*
 * Feature bit masks; all three are currently empty.
 * NOTE(review): presumably the sets of feature bits this implementation
 * understands, matched against ds_feature_* in the superblock - confirm.
 */
#define LOGFS_FEATURES_INCOMPAT (0ull)
#define LOGFS_FEATURES_RO_COMPAT (0ull)
#define LOGFS_FEATURES_COMPAT (0ull)
199
/**
 * struct logfs_disk_super - on-medium superblock
 *
 * @ds_sh: segment header preceding the superblock proper
 * @ds_magic: magic number, must equal LOGFS_MAGIC
 * @ds_crc: crc32 of structure starting with the next field
 * @ds_ifile_levels: maximum number of levels for ifile
 * @ds_iblock_levels: maximum number of levels for regular files
 * @ds_data_levels: number of separate levels for data
 * @ds_segment_shift: log2 of segment size
 * @ds_block_shift: log2 of block size
 * @ds_write_shift: log2 of write size
 * @pad0: reserved, must be 0
 * @ds_filesystem_size: size of the filesystem (presumably in bytes)
 * @ds_segment_size: size of a segment (presumably in bytes)
 * @ds_bad_seg_reserve: number of segments reserved to handle bad blocks
 * @ds_feature_incompat: incompatible filesystem features
 * @ds_feature_ro_compat: read-only compatible filesystem features
 * @ds_feature_compat: compatible filesystem features
 * @ds_feature_flags: flags
 * @ds_root_reserve: bytes reserved for the superuser
 * @ds_speed_reserve: bytes reserved to speed up GC
 * @ds_journal_seg: segments used by primary journal
 * @ds_super_ofs: two superblock offsets (presumably the locations of
 *		  the superblock copies - TODO confirm)
 * @pad3: reserved, must be 0
 *
 * Contains only read-only fields. Read-write fields like the amount of used
 * space is tracked in the dynamic superblock, which is stored in the journal.
 */
struct logfs_disk_super {
	struct logfs_segment_header ds_sh;
	__be64 ds_magic;

	__be32 ds_crc;
	__u8 ds_ifile_levels;
	__u8 ds_iblock_levels;
	__u8 ds_data_levels;
	__u8 ds_segment_shift;
	__u8 ds_block_shift;
	__u8 ds_write_shift;
	__u8 pad0[6];

	__be64 ds_filesystem_size;
	__be32 ds_segment_size;
	__be32 ds_bad_seg_reserve;

	__be64 ds_feature_incompat;
	__be64 ds_feature_ro_compat;

	__be64 ds_feature_compat;
	__be64 ds_feature_flags;

	__be64 ds_root_reserve;
	__be64 ds_speed_reserve;

	__be32 ds_journal_seg[LOGFS_JOURNAL_SEGS];

	__be64 ds_super_ofs[2];
	__be64 pad3[8];
};

SIZE_CHECK(logfs_disk_super, 256);
260
/*
 * Object types (stored in logfs_object_header.type):
 * OBJ_BLOCK - Data or indirect block
 * OBJ_INODE - Inode
 * OBJ_DENTRY - Dentry
 *
 * The values continue after the SEG_* segment types (0x01-0x03).
 */
enum {
	OBJ_BLOCK = 0x04,
	OBJ_INODE = 0x05,
	OBJ_DENTRY = 0x06,
};
272
/**
 * struct logfs_object_header - per-object header in the ostore
 *
 * @crc: crc32 of header, excluding data_crc
 * @len: length of data
 * @type: object type, one of OBJ_BLOCK, OBJ_INODE, OBJ_DENTRY
 * @compr: compression type, one of COMPR_NONE, COMPR_ZLIB
 * @ino: inode number
 * @bix: block index
 * @data_crc: crc32 of payload
 *
 * Packed, because the trailing 32bit field would otherwise be padded;
 * the size must be exactly LOGFS_OBJECT_HEADERSIZE bytes.
 */
struct logfs_object_header {
	__be32 crc;
	__be16 len;
	__u8 type;
	__u8 compr;
	__be64 ino;
	__be64 bix;
	__be32 data_crc;
} __attribute__((packed));

SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE);
295
/*
 * Reserved inode numbers:
 * LOGFS_INO_MAPPING - NOTE(review): undocumented here; presumably the
 *		       inode used for the device mapping - confirm
 * LOGFS_INO_MASTER - master inode (for inode file)
 * LOGFS_INO_ROOT - root directory
 * LOGFS_INO_SEGFILE - per-segment used bytes and erase count
 * LOGFS_RESERVED_INOS - NOTE(review): presumably the number of reserved
 *			 inode numbers, i.e. the first regular one - confirm
 */
enum {
	LOGFS_INO_MAPPING = 0x00,
	LOGFS_INO_MASTER = 0x01,
	LOGFS_INO_ROOT = 0x02,
	LOGFS_INO_SEGFILE = 0x03,
	LOGFS_RESERVED_INOS = 0x10,
};
309
/*
 * Inode flags. High bits should never be written to the medium. They are
 * reserved for in-memory usage.
 * Low bits should either remain in sync with the corresponding FS_*_FL or
 * reuse slots that obviously don't make sense for logfs.
 *
 * LOGFS_IF_COMPRESSED	Same bit as FS_COMPR_FL
 * LOGFS_IF_DIRTY	Inode must be written back
 * LOGFS_IF_ZOMBIE	Inode has been deleted
 * LOGFS_IF_STILLBORN	-ENOSPC happened when creating inode
 */
#define LOGFS_IF_COMPRESSED 0x00000004 /* == FS_COMPR_FL */
#define LOGFS_IF_DIRTY 0x20000000
#define LOGFS_IF_ZOMBIE 0x40000000
#define LOGFS_IF_STILLBORN 0x80000000

/* Flags available to chattr */
#define LOGFS_FL_USER_VISIBLE (LOGFS_IF_COMPRESSED)
#define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED)
/* Flags inherited from parent directory on file/directory creation */
#define LOGFS_FL_INHERITED (LOGFS_IF_COMPRESSED)
330
/**
 * struct logfs_disk_inode - on-medium inode
 *
 * @di_mode: file mode
 * @di_height: height of the inode's block tree (NOTE(review): presumably
 *	       the number of indirection levels in use - confirm)
 * @di_pad: reserved, must be 0
 * @di_flags: inode flags, see above
 * @di_uid: user id
 * @di_gid: group id
 * @di_ctime: change time
 * @di_mtime: modify time
 * @di_atime: access time
 * @di_refcount: reference count (aka nlink or link count)
 * @di_generation: inode generation, for nfs
 * @di_used_bytes: number of bytes used
 * @di_size: file size
 * @di_data: data pointers
 */
struct logfs_disk_inode {
	__be16 di_mode;
	__u8 di_height;
	__u8 di_pad;
	__be32 di_flags;
	__be32 di_uid;
	__be32 di_gid;

	__be64 di_ctime;
	__be64 di_mtime;

	__be64 di_atime;
	__be32 di_refcount;
	__be32 di_generation;

	__be64 di_used_bytes;
	__be64 di_size;

	__be64 di_data[LOGFS_EMBEDDED_FIELDS];
};

SIZE_CHECK(logfs_disk_inode, 200);
369
/*
 * Positions of selected logfs_disk_inode fields, counted in 64bit words
 * from the start of the structure.  INODE_HEIGHT_OFS is word 0, which
 * holds di_mode, di_height, di_pad and di_flags together.
 */
#define INODE_POINTER_OFS \
	(offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64))
#define INODE_USED_OFS \
	(offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64))
#define INODE_SIZE_OFS \
	(offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64))
#define INODE_HEIGHT_OFS (0)
377
/**
 * struct logfs_disk_dentry - on-medium dentry structure
 *
 * @ino: inode number
 * @namelen: length of file name
 * @type: file type, identical to bits 12..15 of mode
 * @name: file name, at most LOGFS_MAX_NAMELEN bytes
 *
 * Packed: the members add up to 266 bytes, which is not a multiple of 8.
 */
/* FIXME: add 6 bytes of padding to remove the __packed */
struct logfs_disk_dentry {
	__be64 ino;
	__be16 namelen;
	__u8 type;
	__u8 name[LOGFS_MAX_NAMELEN];
} __attribute__((packed));

SIZE_CHECK(logfs_disk_dentry, 266);
395
/* Special values for logfs_segment_entry.valid and .ec_level, see below */
#define RESERVED 0xffffffff
#define BADSEG 0xffffffff
/**
 * struct logfs_segment_entry - segment file entry
 *
 * @ec_level: erase count and level
 * @valid: number of valid bytes
 *
 * Segment file contains one entry for every segment. ec_level contains the
 * erasecount in the upper 28 bits and the level in the lower 4 bits. An
 * ec_level of BADSEG (-1) identifies bad segments. valid contains the number
 * of valid bytes or RESERVED (-1 again) if the segment is used for either the
 * superblock or the journal, or when the segment is bad.
 */
struct logfs_segment_entry {
	__be32 ec_level;
	__be32 valid;
};

SIZE_CHECK(logfs_segment_entry, 8);
416
/**
 * struct logfs_journal_header - header for journal entries (JEs)
 *
 * @h_crc: crc32 of journal entry
 * @h_len: length of compressed journal entry,
 *	   not including header
 * @h_datalen: length of uncompressed data
 * @h_type: JE type, see the JE_* constants
 * @h_compr: compression type, one of COMPR_NONE, COMPR_ZLIB
 * @h_pad: reserved
 */
struct logfs_journal_header {
	__be32 h_crc;
	__be16 h_len;
	__be16 h_datalen;
	__be16 h_type;
	__u8 h_compr;
	__u8 h_pad[5];
};

SIZE_CHECK(logfs_journal_header, 16);
438
/*
 * Life expectancy of data.
 * VIM_DEFAULT - default vim
 * VIM_SEGFILE - for segment file only - very short-living
 *
 * NOTE(review): a third value, VIM_GC (GC'd data - likely long-living), was
 * documented here but is not defined by this enum.
 */
enum logfs_vim {
	VIM_DEFAULT = 0,
	VIM_SEGFILE = 1,
};
449
/**
 * struct logfs_je_area - wbuf header
 *
 * @segno: segment number of area
 * @used_bytes: number of bytes already used
 * @gc_level: GC level
 * @vim: life expectancy of data
 *
 * "Areas" are segments currently being used for writing. There is at least
 * one area per GC level. Several may be used to separate long-living from
 * short-living data. If an area with unknown vim is encountered, it can
 * simply be closed.
 * The write buffer immediately follows this header.
 */
struct logfs_je_area {
	__be32 segno;
	__be32 used_bytes;
	__u8 gc_level;
	__u8 vim;
} __attribute__((packed));

SIZE_CHECK(logfs_je_area, 10);

/* Combined size of a journal header followed by an area header */
#define MAX_JOURNAL_HEADER \
	(sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area))
475
/**
 * struct logfs_je_dynsb - dynamic superblock
 *
 * @ds_gec: global erase count
 * @ds_sweeper: current position of GC "sweeper"
 * @ds_rename_dir: source directory ino (see dir.c documentation)
 * @ds_rename_pos: position of source dd (see dir.c documentation)
 * @ds_victim_ino: victims of incomplete dir operation (see dir.c)
 * @ds_victim_parent: parent inode of victim (see dir.c)
 * @ds_used_bytes: number of used bytes
 * @ds_generation: generation counter (NOTE(review): undocumented in the
 *		   original; presumably the filesystem-wide inode
 *		   generation - confirm)
 * @pad: padding up to 64 bytes
 */
struct logfs_je_dynsb {
	__be64 ds_gec;
	__be64 ds_sweeper;

	__be64 ds_rename_dir;
	__be64 ds_rename_pos;

	__be64 ds_victim_ino;
	__be64 ds_victim_parent; /* XXX */

	__be64 ds_used_bytes;
	__be32 ds_generation;
	__be32 pad;
};

SIZE_CHECK(logfs_je_dynsb, 64);
503
/**
 * struct logfs_je_anchor - anchor of filesystem tree, aka master inode
 *
 * @da_size: size of inode file
 * @da_last_ino: last created inode
 * @da_used_bytes: number of bytes used
 * @da_height: height of the inode file's block tree (counterpart of
 *	       di_height in struct logfs_disk_inode)
 * @pad: padding to the next 64bit boundary
 * @da_data: data pointers
 */
struct logfs_je_anchor {
	__be64 da_size;
	__be64 da_last_ino;

	__be64 da_used_bytes;
	u8 da_height;
	u8 pad[7];

	__be64 da_data[LOGFS_EMBEDDED_FIELDS];
};

SIZE_CHECK(logfs_je_anchor, 168);
524
/**
 * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal)
 *
 * @so_segment: segments used for 2nd journal
 *
 * Length of the array is given by h_len field in the header.  The
 * structure itself is zero-sized; it only names the variable-length
 * payload.
 */
struct logfs_je_spillout {
	__be64 so_segment[0];
};

SIZE_CHECK(logfs_je_spillout, 0);
537
/**
 * struct logfs_je_journal_ec - erase counts for all journal segments
 *
 * @ec: erase count
 *
 * Length of the array is given by h_len field in the header.  The
 * structure itself is zero-sized; it only names the variable-length
 * payload.
 */
struct logfs_je_journal_ec {
	__be32 ec[0];
};

SIZE_CHECK(logfs_je_journal_ec, 0);
550
/**
 * struct logfs_je_free_segments - list of free segments with erase count
 *
 * @segno: segment number
 * @ec: erase count of that segment
 */
struct logfs_je_free_segments {
	__be32 segno;
	__be32 ec;
};

SIZE_CHECK(logfs_je_free_segments, 8);
560
/**
 * struct logfs_seg_alias - list of segment aliases
 *
 * @old_segno: segment number being aliased
 * @new_segno: segment number it is aliased to
 *
 * NOTE(review): field meanings inferred from their names - confirm.
 */
struct logfs_seg_alias {
	__be32 old_segno;
	__be32 new_segno;
};

SIZE_CHECK(logfs_seg_alias, 8);
570
/**
 * struct logfs_obj_alias - list of object aliases
 *
 * @ino: inode number of the aliased block's owner
 * @bix: block index
 * @val: aliased value (a 64bit big-endian word)
 * @level: level of the block
 * @pad: padding
 * @child_no: position of the aliased word within the block
 *
 * NOTE(review): field meanings inferred from their names and from how
 * aliases are written in readwrite.c - confirm.
 */
struct logfs_obj_alias {
	__be64 ino;
	__be64 bix;
	__be64 val;
	u8 level;
	u8 pad[5];
	__be16 child_no;
};

SIZE_CHECK(logfs_obj_alias, 32);
584
/**
 * Compression types (used in logfs_object_header.compr and
 * logfs_journal_header.h_compr).
 *
 * COMPR_NONE - uncompressed
 * COMPR_ZLIB - compressed with zlib
 */
enum {
	COMPR_NONE = 0,
	COMPR_ZLIB = 1,
};
595
/*
 * Journal entries come in groups of 16. First group contains unique
 * entries, next groups contain one entry per level
 *
 * JE_FIRST - smallest possible journal entry number
 *
 * JEG_BASE - base group, containing unique entries
 * JE_COMMIT - commit entry, validates all previous entries
 * JE_DYNSB - dynamic superblock, anything that ought to be in the
 *	      superblock but cannot because it is read-write data
 * JE_ANCHOR - anchor aka master inode aka inode file's inode
 * JE_ERASECOUNT - erasecounts for all journal segments
 * JE_SPILLOUT - unused
 * JE_OBJ_ALIAS - object aliases (NOTE(review): the comment used to name
 *		  JE_SEG_ALIAS, which this enum does not define)
 * JE_AREA - area description
 *
 * JE_LAST - largest possible journal entry number
 */
enum {
	JE_FIRST = 0x01,

	JEG_BASE = 0x00,
	JE_COMMIT = 0x02,
	JE_DYNSB = 0x03,
	JE_ANCHOR = 0x04,
	JE_ERASECOUNT = 0x05,
	JE_SPILLOUT = 0x06,
	JE_OBJ_ALIAS = 0x0d,
	JE_AREA = 0x0e,

	JE_LAST = 0x0e,
};
628
629#endif
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
new file mode 100644
index 000000000000..3159db6958e5
--- /dev/null
+++ b/fs/logfs/readwrite.c
@@ -0,0 +1,2267 @@
1/*
2 * fs/logfs/readwrite.c
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 *
9 * Actually contains eight sets of very similar functions:
10 * read read blocks from a file
11 * seek_hole find next hole
12 * seek_data find next data block
13 * valid check whether a block still belongs to a file
14 * write write blocks to a file
15 * delete delete a block (for directories and ifile)
16 * rewrite move existing blocks of a file to a new location (gc helper)
17 * truncate truncate a file
18 */
19#include "logfs.h"
20#include <linux/sched.h>
21#include <linux/slab.h>
22
23static u64 adjust_bix(u64 bix, level_t level)
24{
25 switch (level) {
26 case 0:
27 return bix;
28 case LEVEL(1):
29 return max_t(u64, bix, I0_BLOCKS);
30 case LEVEL(2):
31 return max_t(u64, bix, I1_BLOCKS);
32 case LEVEL(3):
33 return max_t(u64, bix, I2_BLOCKS);
34 case LEVEL(4):
35 return max_t(u64, bix, I3_BLOCKS);
36 case LEVEL(5):
37 return max_t(u64, bix, I4_BLOCKS);
38 default:
39 WARN_ON(1);
40 return bix;
41 }
42}
43
44static inline u64 maxbix(u8 height)
45{
46 return 1ULL << (LOGFS_BLOCK_BITS * height);
47}
48
/**
 * The inode address space is cut in two halves. Lower half belongs to data
 * pages, upper half to indirect blocks. If the high bit (INDIRECT_BIT) is
 * set, the actual block index (bix) and level can be derived from the page
 * index.
 *
 * The lowest three bits of the block index are set to 0 after packing and
 * unpacking. Since the lowest n bits (9 for 4KiB blocksize) are ignored
 * anyway this is harmless.
 *
 * ARCH_SHIFT   - number of bits an unsigned long has beyond 32
 * INDIRECT_BIT - the topmost bit of an unsigned long
 * LEVEL_SHIFT  - bit position of the level field, directly below
 *		  INDIRECT_BIT (bits 28..30 when BITS_PER_LONG is 32)
 */
#define ARCH_SHIFT (BITS_PER_LONG - 32)
#define INDIRECT_BIT (0x80000000UL << ARCH_SHIFT)
#define LEVEL_SHIFT (28 + ARCH_SHIFT)
62static inline pgoff_t first_indirect_block(void)
63{
64 return INDIRECT_BIT | (1ULL << LEVEL_SHIFT);
65}
66
67pgoff_t logfs_pack_index(u64 bix, level_t level)
68{
69 pgoff_t index;
70
71 BUG_ON(bix >= INDIRECT_BIT);
72 if (level == 0)
73 return bix;
74
75 index = INDIRECT_BIT;
76 index |= (__force long)level << LEVEL_SHIFT;
77 index |= bix >> ((__force u8)level * LOGFS_BLOCK_BITS);
78 return index;
79}
80
81void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level)
82{
83 u8 __level;
84
85 if (!(index & INDIRECT_BIT)) {
86 *bix = index;
87 *level = 0;
88 return;
89 }
90
91 __level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT;
92 *level = LEVEL(__level);
93 *bix = (index << (__level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT;
94 *bix = adjust_bix(*bix, *level);
95 return;
96}
97#undef ARCH_SHIFT
98#undef INDIRECT_BIT
99#undef LEVEL_SHIFT
100
101/*
102 * Time is stored as nanoseconds since the epoch.
103 */
104static struct timespec be64_to_timespec(__be64 betime)
105{
106 return ns_to_timespec(be64_to_cpu(betime));
107}
108
109static __be64 timespec_to_be64(struct timespec tsp)
110{
111 return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec);
112}
113
114static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode)
115{
116 struct logfs_inode *li = logfs_inode(inode);
117 int i;
118
119 inode->i_mode = be16_to_cpu(di->di_mode);
120 li->li_height = di->di_height;
121 li->li_flags = be32_to_cpu(di->di_flags);
122 inode->i_uid = be32_to_cpu(di->di_uid);
123 inode->i_gid = be32_to_cpu(di->di_gid);
124 inode->i_size = be64_to_cpu(di->di_size);
125 logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes));
126 inode->i_atime = be64_to_timespec(di->di_atime);
127 inode->i_ctime = be64_to_timespec(di->di_ctime);
128 inode->i_mtime = be64_to_timespec(di->di_mtime);
129 inode->i_nlink = be32_to_cpu(di->di_refcount);
130 inode->i_generation = be32_to_cpu(di->di_generation);
131
132 switch (inode->i_mode & S_IFMT) {
133 case S_IFSOCK: /* fall through */
134 case S_IFBLK: /* fall through */
135 case S_IFCHR: /* fall through */
136 case S_IFIFO:
137 inode->i_rdev = be64_to_cpu(di->di_data[0]);
138 break;
139 case S_IFDIR: /* fall through */
140 case S_IFREG: /* fall through */
141 case S_IFLNK:
142 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
143 li->li_data[i] = be64_to_cpu(di->di_data[i]);
144 break;
145 default:
146 BUG();
147 }
148}
149
150static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode*di)
151{
152 struct logfs_inode *li = logfs_inode(inode);
153 int i;
154
155 di->di_mode = cpu_to_be16(inode->i_mode);
156 di->di_height = li->li_height;
157 di->di_pad = 0;
158 di->di_flags = cpu_to_be32(li->li_flags);
159 di->di_uid = cpu_to_be32(inode->i_uid);
160 di->di_gid = cpu_to_be32(inode->i_gid);
161 di->di_size = cpu_to_be64(i_size_read(inode));
162 di->di_used_bytes = cpu_to_be64(li->li_used_bytes);
163 di->di_atime = timespec_to_be64(inode->i_atime);
164 di->di_ctime = timespec_to_be64(inode->i_ctime);
165 di->di_mtime = timespec_to_be64(inode->i_mtime);
166 di->di_refcount = cpu_to_be32(inode->i_nlink);
167 di->di_generation = cpu_to_be32(inode->i_generation);
168
169 switch (inode->i_mode & S_IFMT) {
170 case S_IFSOCK: /* fall through */
171 case S_IFBLK: /* fall through */
172 case S_IFCHR: /* fall through */
173 case S_IFIFO:
174 di->di_data[0] = cpu_to_be64(inode->i_rdev);
175 break;
176 case S_IFDIR: /* fall through */
177 case S_IFREG: /* fall through */
178 case S_IFLNK:
179 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
180 di->di_data[i] = cpu_to_be64(li->li_data[i]);
181 break;
182 default:
183 BUG();
184 }
185}
186
187static void __logfs_set_blocks(struct inode *inode)
188{
189 struct super_block *sb = inode->i_sb;
190 struct logfs_inode *li = logfs_inode(inode);
191
192 inode->i_blocks = ULONG_MAX;
193 if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX)
194 inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9;
195}
196
197void logfs_set_blocks(struct inode *inode, u64 bytes)
198{
199 struct logfs_inode *li = logfs_inode(inode);
200
201 li->li_used_bytes = bytes;
202 __logfs_set_blocks(inode);
203}
204
205static void prelock_page(struct super_block *sb, struct page *page, int lock)
206{
207 struct logfs_super *super = logfs_super(sb);
208
209 BUG_ON(!PageLocked(page));
210 if (lock) {
211 BUG_ON(PagePreLocked(page));
212 SetPagePreLocked(page);
213 } else {
214 /* We are in GC path. */
215 if (PagePreLocked(page))
216 super->s_lock_count++;
217 else
218 SetPagePreLocked(page);
219 }
220}
221
222static void preunlock_page(struct super_block *sb, struct page *page, int lock)
223{
224 struct logfs_super *super = logfs_super(sb);
225
226 BUG_ON(!PageLocked(page));
227 if (lock)
228 ClearPagePreLocked(page);
229 else {
230 /* We are in GC path. */
231 BUG_ON(!PagePreLocked(page));
232 if (super->s_lock_count)
233 super->s_lock_count--;
234 else
235 ClearPagePreLocked(page);
236 }
237}
238
239/*
240 * Logfs is prone to an AB-BA deadlock where one task tries to acquire
241 * s_write_mutex with a locked page and GC tries to get that page while holding
242 * s_write_mutex.
243 * To solve this issue logfs will ignore the page lock iff the page in question
244 * is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked
245 * in addition to PG_locked.
246 */
247static void logfs_get_wblocks(struct super_block *sb, struct page *page,
248 int lock)
249{
250 struct logfs_super *super = logfs_super(sb);
251
252 if (page)
253 prelock_page(sb, page, lock);
254
255 if (lock) {
256 mutex_lock(&super->s_write_mutex);
257 logfs_gc_pass(sb);
258 /* FIXME: We also have to check for shadowed space
259 * and mempool fill grade */
260 }
261}
262
263static void logfs_put_wblocks(struct super_block *sb, struct page *page,
264 int lock)
265{
266 struct logfs_super *super = logfs_super(sb);
267
268 if (page)
269 preunlock_page(sb, page, lock);
270 /* Order matters - we must clear PG_pre_locked before releasing
271 * s_write_mutex or we could race against another task. */
272 if (lock)
273 mutex_unlock(&super->s_write_mutex);
274}
275
276static struct page *logfs_get_read_page(struct inode *inode, u64 bix,
277 level_t level)
278{
279 return find_or_create_page(inode->i_mapping,
280 logfs_pack_index(bix, level), GFP_NOFS);
281}
282
/* Unlock @page and drop the page cache reference held on it. */
static void logfs_put_read_page(struct page *page)
{
	unlock_page(page);
	page_cache_release(page);
}
288
289static void logfs_lock_write_page(struct page *page)
290{
291 int loop = 0;
292
293 while (unlikely(!trylock_page(page))) {
294 if (loop++ > 0x1000) {
295 /* Has been observed once so far... */
296 printk(KERN_ERR "stack at %p\n", &loop);
297 BUG();
298 }
299 if (PagePreLocked(page)) {
300 /* Holder of page lock is waiting for us, it
301 * is safe to use this page. */
302 break;
303 }
304 /* Some other process has this page locked and has
305 * nothing to do with us. Wait for it to finish.
306 */
307 schedule();
308 }
309 BUG_ON(!PageLocked(page));
310}
311
312static struct page *logfs_get_write_page(struct inode *inode, u64 bix,
313 level_t level)
314{
315 struct address_space *mapping = inode->i_mapping;
316 pgoff_t index = logfs_pack_index(bix, level);
317 struct page *page;
318 int err;
319
320repeat:
321 page = find_get_page(mapping, index);
322 if (!page) {
323 page = __page_cache_alloc(GFP_NOFS);
324 if (!page)
325 return NULL;
326 err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS);
327 if (unlikely(err)) {
328 page_cache_release(page);
329 if (err == -EEXIST)
330 goto repeat;
331 return NULL;
332 }
333 } else logfs_lock_write_page(page);
334 BUG_ON(!PageLocked(page));
335 return page;
336}
337
/* Unlock @page, unless it is pre-locked - then the lock is left alone. */
static void logfs_unlock_write_page(struct page *page)
{
	if (PagePreLocked(page))
		return;
	unlock_page(page);
}
343
/*
 * Release a page obtained for writing: unlock it (unless pre-locked, see
 * logfs_unlock_write_page()) and drop the page cache reference.
 */
static void logfs_put_write_page(struct page *page)
{
	logfs_unlock_write_page(page);
	page_cache_release(page);
}
349
350static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level,
351 int rw)
352{
353 if (rw == READ)
354 return logfs_get_read_page(inode, bix, level);
355 else
356 return logfs_get_write_page(inode, bix, level);
357}
358
359static void logfs_put_page(struct page *page, int rw)
360{
361 if (rw == READ)
362 logfs_put_read_page(page);
363 else
364 logfs_put_write_page(page);
365}
366
/*
 * Extract a group of @no bits from @val, skipping @skip such groups counted
 * from the least significant end.
 *
 * @val:  value to extract from
 * @skip: number of @no-bit groups to skip
 * @no:   width of the group in bits
 *
 * Returns the selected bits, right-aligned.  If nothing can be selected -
 * the width is not positive, or the group lies beyond bit 63 - the result
 * is 0.  The earlier shift-left/shift-right formulation shifted by 64 for
 * @no == 0 and by an unchecked skip * no, both of which are undefined
 * behaviour in C.
 */
static unsigned long __get_bits(u64 val, int skip, int no)
{
	u64 mask;
	int shift;

	if (no <= 0 || skip < 0)
		return 0;
	/* Compare by division so that skip * no cannot overflow. */
	if (skip > 63 / no)
		return 0;
	shift = skip * no;

	mask = no >= 64 ? ~0ULL : (1ULL << no) - 1;
	return (val >> shift) & mask;
}
376
/*
 * Extract the LOGFS_BLOCK_BITS wide group of @val selected by level @skip,
 * i.e. the bits starting at skip * LOGFS_BLOCK_BITS.
 */
static unsigned long get_bits(u64 val, level_t skip)
{
	return __get_bits(val, (__force int)skip, LOGFS_BLOCK_BITS);
}
381
382static inline void init_shadow_tree(struct super_block *sb,
383 struct shadow_tree *tree)
384{
385 struct logfs_super *super = logfs_super(sb);
386
387 btree_init_mempool64(&tree->new, super->s_btree_pool);
388 btree_init_mempool64(&tree->old, super->s_btree_pool);
389}
390
391static void indirect_write_block(struct logfs_block *block)
392{
393 struct page *page;
394 struct inode *inode;
395 int ret;
396
397 page = block->page;
398 inode = page->mapping->host;
399 logfs_lock_write_page(page);
400 ret = logfs_write_buf(inode, page, 0);
401 logfs_unlock_write_page(page);
402 /*
403 * This needs some rework. Unless you want your filesystem to run
404 * completely synchronously (you don't), the filesystem will always
405 * report writes as 'successful' before the actual work has been
406 * done. The actual work gets done here and this is where any errors
407 * will show up. And there isn't much we can do about it, really.
408 *
409 * Some attempts to fix the errors (move from bad blocks, retry io,...)
410 * have already been done, so anything left should be either a broken
411 * device or a bug somewhere in logfs itself. Being relatively new,
412 * the odds currently favor a bug, so for now the line below isn't
413 * entirely tasteles.
414 */
415 BUG_ON(ret);
416}
417
418static void inode_write_block(struct logfs_block *block)
419{
420 struct inode *inode;
421 int ret;
422
423 inode = block->inode;
424 if (inode->i_ino == LOGFS_INO_MASTER)
425 logfs_write_anchor(inode->i_sb);
426 else {
427 ret = __logfs_write_inode(inode, 0);
428 /* see indirect_write_block comment */
429 BUG_ON(ret);
430 }
431}
432
/*
 * Thin wrapper around find_next_bit().
 *
 * This silences a false, yet annoying gcc warning. I hate it when my editor
 * jumps into bitops.h each time I recompile this file.
 * TODO: Complain to gcc folks about this and upgrade compiler.
 */
static unsigned long fnb(const unsigned long *addr,
		unsigned long size, unsigned long offset)
{
	return find_next_bit(addr, size, offset);
}
443
444static __be64 inode_val0(struct inode *inode)
445{
446 struct logfs_inode *li = logfs_inode(inode);
447 u64 val;
448
449 /*
450 * Explicit shifting generates good code, but must match the format
451 * of the structure. Add some paranoia just in case.
452 */
453 BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_mode) != 0);
454 BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_height) != 2);
455 BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_flags) != 4);
456
457 val = (u64)inode->i_mode << 48 |
458 (u64)li->li_height << 40 |
459 (u64)li->li_flags;
460 return cpu_to_be64(val);
461}
462
463static int inode_write_alias(struct super_block *sb,
464 struct logfs_block *block, write_alias_t *write_one_alias)
465{
466 struct inode *inode = block->inode;
467 struct logfs_inode *li = logfs_inode(inode);
468 unsigned long pos;
469 u64 ino , bix;
470 __be64 val;
471 level_t level;
472 int err;
473
474 for (pos = 0; ; pos++) {
475 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
476 if (pos >= LOGFS_EMBEDDED_FIELDS + INODE_POINTER_OFS)
477 return 0;
478
479 switch (pos) {
480 case INODE_HEIGHT_OFS:
481 val = inode_val0(inode);
482 break;
483 case INODE_USED_OFS:
484 val = cpu_to_be64(li->li_used_bytes);;
485 break;
486 case INODE_SIZE_OFS:
487 val = cpu_to_be64(i_size_read(inode));
488 break;
489 case INODE_POINTER_OFS ... INODE_POINTER_OFS + LOGFS_EMBEDDED_FIELDS - 1:
490 val = cpu_to_be64(li->li_data[pos - INODE_POINTER_OFS]);
491 break;
492 default:
493 BUG();
494 }
495
496 ino = LOGFS_INO_MASTER;
497 bix = inode->i_ino;
498 level = LEVEL(0);
499 err = write_one_alias(sb, ino, bix, level, pos, val);
500 if (err)
501 return err;
502 }
503}
504
505static int indirect_write_alias(struct super_block *sb,
506 struct logfs_block *block, write_alias_t *write_one_alias)
507{
508 unsigned long pos;
509 struct page *page = block->page;
510 u64 ino , bix;
511 __be64 *child, val;
512 level_t level;
513 int err;
514
515 for (pos = 0; ; pos++) {
516 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
517 if (pos >= LOGFS_BLOCK_FACTOR)
518 return 0;
519
520 ino = page->mapping->host->i_ino;
521 logfs_unpack_index(page->index, &bix, &level);
522 child = kmap_atomic(page, KM_USER0);
523 val = child[pos];
524 kunmap_atomic(child, KM_USER0);
525 err = write_one_alias(sb, ino, bix, level, pos, val);
526 if (err)
527 return err;
528 }
529}
530
531int logfs_write_obj_aliases_pagecache(struct super_block *sb)
532{
533 struct logfs_super *super = logfs_super(sb);
534 struct logfs_block *block;
535 int err;
536
537 list_for_each_entry(block, &super->s_object_alias, alias_list) {
538 err = block->ops->write_alias(sb, block, write_alias_journal);
539 if (err)
540 return err;
541 }
542 return 0;
543}
544
545void __free_block(struct super_block *sb, struct logfs_block *block)
546{
547 BUG_ON(!list_empty(&block->item_list));
548 list_del(&block->alias_list);
549 mempool_free(block, logfs_super(sb)->s_block_pool);
550}
551
552static void inode_free_block(struct super_block *sb, struct logfs_block *block)
553{
554 struct inode *inode = block->inode;
555
556 logfs_inode(inode)->li_block = NULL;
557 __free_block(sb, block);
558}
559
560static void indirect_free_block(struct super_block *sb,
561 struct logfs_block *block)
562{
563 ClearPagePrivate(block->page);
564 block->page->private = 0;
565 __free_block(sb, block);
566}
567
568
569static struct logfs_block_ops inode_block_ops = {
570 .write_block = inode_write_block,
571 .free_block = inode_free_block,
572 .write_alias = inode_write_alias,
573};
574
575struct logfs_block_ops indirect_block_ops = {
576 .write_block = indirect_write_block,
577 .free_block = indirect_free_block,
578 .write_alias = indirect_write_alias,
579};
580
581struct logfs_block *__alloc_block(struct super_block *sb,
582 u64 ino, u64 bix, level_t level)
583{
584 struct logfs_super *super = logfs_super(sb);
585 struct logfs_block *block;
586
587 block = mempool_alloc(super->s_block_pool, GFP_NOFS);
588 memset(block, 0, sizeof(*block));
589 INIT_LIST_HEAD(&block->alias_list);
590 INIT_LIST_HEAD(&block->item_list);
591 block->sb = sb;
592 block->ino = ino;
593 block->bix = bix;
594 block->level = level;
595 return block;
596}
597
598static void alloc_inode_block(struct inode *inode)
599{
600 struct logfs_inode *li = logfs_inode(inode);
601 struct logfs_block *block;
602
603 if (li->li_block)
604 return;
605
606 block = __alloc_block(inode->i_sb, LOGFS_INO_MASTER, inode->i_ino, 0);
607 block->inode = inode;
608 li->li_block = block;
609 block->ops = &inode_block_ops;
610}
611
612void initialize_block_counters(struct page *page, struct logfs_block *block,
613 __be64 *array, int page_is_empty)
614{
615 u64 ptr;
616 int i, start;
617
618 block->partial = 0;
619 block->full = 0;
620 start = 0;
621 if (page->index < first_indirect_block()) {
622 /* Counters are pointless on level 0 */
623 return;
624 }
625 if (page->index == first_indirect_block()) {
626 /* Skip unused pointers */
627 start = I0_BLOCKS;
628 block->full = I0_BLOCKS;
629 }
630 if (!page_is_empty) {
631 for (i = start; i < LOGFS_BLOCK_FACTOR; i++) {
632 ptr = be64_to_cpu(array[i]);
633 if (ptr)
634 block->partial++;
635 if (ptr & LOGFS_FULLY_POPULATED)
636 block->full++;
637 }
638 }
639}
640
641static void alloc_data_block(struct inode *inode, struct page *page)
642{
643 struct logfs_block *block;
644 u64 bix;
645 level_t level;
646
647 if (PagePrivate(page))
648 return;
649
650 logfs_unpack_index(page->index, &bix, &level);
651 block = __alloc_block(inode->i_sb, inode->i_ino, bix, level);
652 block->page = page;
653 SetPagePrivate(page);
654 page->private = (unsigned long)block;
655 block->ops = &indirect_block_ops;
656}
657
658static void alloc_indirect_block(struct inode *inode, struct page *page,
659 int page_is_empty)
660{
661 struct logfs_block *block;
662 __be64 *array;
663
664 if (PagePrivate(page))
665 return;
666
667 alloc_data_block(inode, page);
668
669 block = logfs_block(page);
670 array = kmap_atomic(page, KM_USER0);
671 initialize_block_counters(page, block, array, page_is_empty);
672 kunmap_atomic(array, KM_USER0);
673}
674
675static void block_set_pointer(struct page *page, int index, u64 ptr)
676{
677 struct logfs_block *block = logfs_block(page);
678 __be64 *array;
679 u64 oldptr;
680
681 BUG_ON(!block);
682 array = kmap_atomic(page, KM_USER0);
683 oldptr = be64_to_cpu(array[index]);
684 array[index] = cpu_to_be64(ptr);
685 kunmap_atomic(array, KM_USER0);
686 SetPageUptodate(page);
687
688 block->full += !!(ptr & LOGFS_FULLY_POPULATED)
689 - !!(oldptr & LOGFS_FULLY_POPULATED);
690 block->partial += !!ptr - !!oldptr;
691}
692
693static u64 block_get_pointer(struct page *page, int index)
694{
695 __be64 *block;
696 u64 ptr;
697
698 block = kmap_atomic(page, KM_USER0);
699 ptr = be64_to_cpu(block[index]);
700 kunmap_atomic(block, KM_USER0);
701 return ptr;
702}
703
/*
 * "Read" a block that has no backing object: zero the whole page.
 * Always returns 0.
 */
static int logfs_read_empty(struct page *page)
{
	zero_user_segment(page, 0, PAGE_CACHE_SIZE);
	return 0;
}
709
710static int logfs_read_direct(struct inode *inode, struct page *page)
711{
712 struct logfs_inode *li = logfs_inode(inode);
713 pgoff_t index = page->index;
714 u64 block;
715
716 block = li->li_data[index];
717 if (!block)
718 return logfs_read_empty(page);
719
720 return logfs_segment_read(inode, page, block, index, 0);
721}
722
723static int logfs_read_loop(struct inode *inode, struct page *page,
724 int rw_context)
725{
726 struct logfs_inode *li = logfs_inode(inode);
727 u64 bix, bofs = li->li_data[INDIRECT_INDEX];
728 level_t level, target_level;
729 int ret;
730 struct page *ipage;
731
732 logfs_unpack_index(page->index, &bix, &target_level);
733 if (!bofs)
734 return logfs_read_empty(page);
735
736 if (bix >= maxbix(li->li_height))
737 return logfs_read_empty(page);
738
739 for (level = LEVEL(li->li_height);
740 (__force u8)level > (__force u8)target_level;
741 level = SUBLEVEL(level)){
742 ipage = logfs_get_page(inode, bix, level, rw_context);
743 if (!ipage)
744 return -ENOMEM;
745
746 ret = logfs_segment_read(inode, ipage, bofs, bix, level);
747 if (ret) {
748 logfs_put_read_page(ipage);
749 return ret;
750 }
751
752 bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
753 logfs_put_page(ipage, rw_context);
754 if (!bofs)
755 return logfs_read_empty(page);
756 }
757
758 return logfs_segment_read(inode, page, bofs, bix, 0);
759}
760
761static int logfs_read_block(struct inode *inode, struct page *page,
762 int rw_context)
763{
764 pgoff_t index = page->index;
765
766 if (index < I0_BLOCKS)
767 return logfs_read_direct(inode, page);
768 return logfs_read_loop(inode, page, rw_context);
769}
770
771static int logfs_exist_loop(struct inode *inode, u64 bix)
772{
773 struct logfs_inode *li = logfs_inode(inode);
774 u64 bofs = li->li_data[INDIRECT_INDEX];
775 level_t level;
776 int ret;
777 struct page *ipage;
778
779 if (!bofs)
780 return 0;
781 if (bix >= maxbix(li->li_height))
782 return 0;
783
784 for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
785 ipage = logfs_get_read_page(inode, bix, level);
786 if (!ipage)
787 return -ENOMEM;
788
789 ret = logfs_segment_read(inode, ipage, bofs, bix, level);
790 if (ret) {
791 logfs_put_read_page(ipage);
792 return ret;
793 }
794
795 bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
796 logfs_put_read_page(ipage);
797 if (!bofs)
798 return 0;
799 }
800
801 return 1;
802}
803
804int logfs_exist_block(struct inode *inode, u64 bix)
805{
806 struct logfs_inode *li = logfs_inode(inode);
807
808 if (bix < I0_BLOCKS)
809 return !!li->li_data[bix];
810 return logfs_exist_loop(inode, bix);
811}
812
813static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data)
814{
815 struct logfs_inode *li = logfs_inode(inode);
816
817 for (; bix < I0_BLOCKS; bix++)
818 if (data ^ (li->li_data[bix] == 0))
819 return bix;
820 return I0_BLOCKS;
821}
822
823static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data)
824{
825 struct logfs_inode *li = logfs_inode(inode);
826 __be64 *rblock;
827 u64 increment, bofs = li->li_data[INDIRECT_INDEX];
828 level_t level;
829 int ret, slot;
830 struct page *page;
831
832 BUG_ON(!bofs);
833
834 for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
835 increment = 1 << (LOGFS_BLOCK_BITS * ((__force u8)level-1));
836 page = logfs_get_read_page(inode, bix, level);
837 if (!page)
838 return bix;
839
840 ret = logfs_segment_read(inode, page, bofs, bix, level);
841 if (ret) {
842 logfs_put_read_page(page);
843 return bix;
844 }
845
846 slot = get_bits(bix, SUBLEVEL(level));
847 rblock = kmap_atomic(page, KM_USER0);
848 while (slot < LOGFS_BLOCK_FACTOR) {
849 if (data && (rblock[slot] != 0))
850 break;
851 if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED))
852 break;
853 slot++;
854 bix += increment;
855 bix &= ~(increment - 1);
856 }
857 if (slot >= LOGFS_BLOCK_FACTOR) {
858 kunmap_atomic(rblock, KM_USER0);
859 logfs_put_read_page(page);
860 return bix;
861 }
862 bofs = be64_to_cpu(rblock[slot]);
863 kunmap_atomic(rblock, KM_USER0);
864 logfs_put_read_page(page);
865 if (!bofs) {
866 BUG_ON(data);
867 return bix;
868 }
869 }
870 return bix;
871}
872
873/**
874 * logfs_seek_hole - find next hole starting at a given block index
875 * @inode: inode to search in
876 * @bix: block index to start searching
877 *
878 * Returns next hole. If the file doesn't contain any further holes, the
879 * block address next to eof is returned instead.
880 */
881u64 logfs_seek_hole(struct inode *inode, u64 bix)
882{
883 struct logfs_inode *li = logfs_inode(inode);
884
885 if (bix < I0_BLOCKS) {
886 bix = seek_holedata_direct(inode, bix, 0);
887 if (bix < I0_BLOCKS)
888 return bix;
889 }
890
891 if (!li->li_data[INDIRECT_INDEX])
892 return bix;
893 else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED)
894 bix = maxbix(li->li_height);
895 else {
896 bix = seek_holedata_loop(inode, bix, 0);
897 if (bix < maxbix(li->li_height))
898 return bix;
899 /* Should not happen anymore. But if some port writes semi-
900 * corrupt images (as this one used to) we might run into it.
901 */
902 WARN_ON_ONCE(bix == maxbix(li->li_height));
903 }
904
905 return bix;
906}
907
908static u64 __logfs_seek_data(struct inode *inode, u64 bix)
909{
910 struct logfs_inode *li = logfs_inode(inode);
911
912 if (bix < I0_BLOCKS) {
913 bix = seek_holedata_direct(inode, bix, 1);
914 if (bix < I0_BLOCKS)
915 return bix;
916 }
917
918 if (bix < maxbix(li->li_height)) {
919 if (!li->li_data[INDIRECT_INDEX])
920 bix = maxbix(li->li_height);
921 else
922 return seek_holedata_loop(inode, bix, 1);
923 }
924
925 return bix;
926}
927
928/**
929 * logfs_seek_data - find next data block after a given block index
930 * @inode: inode to search in
931 * @bix: block index to start searching
932 *
933 * Returns next data block. If the file doesn't contain any further data
934 * blocks, the last block in the file is returned instead.
935 */
936u64 logfs_seek_data(struct inode *inode, u64 bix)
937{
938 struct super_block *sb = inode->i_sb;
939 u64 ret, end;
940
941 ret = __logfs_seek_data(inode, bix);
942 end = i_size_read(inode) >> sb->s_blocksize_bits;
943 if (ret >= end)
944 ret = max(bix, end);
945 return ret;
946}
947
948static int logfs_is_valid_direct(struct logfs_inode *li, u64 bix, u64 ofs)
949{
950 return pure_ofs(li->li_data[bix]) == ofs;
951}
952
953static int __logfs_is_valid_loop(struct inode *inode, u64 bix,
954 u64 ofs, u64 bofs)
955{
956 struct logfs_inode *li = logfs_inode(inode);
957 level_t level;
958 int ret;
959 struct page *page;
960
961 for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)){
962 page = logfs_get_write_page(inode, bix, level);
963 BUG_ON(!page);
964
965 ret = logfs_segment_read(inode, page, bofs, bix, level);
966 if (ret) {
967 logfs_put_write_page(page);
968 return 0;
969 }
970
971 bofs = block_get_pointer(page, get_bits(bix, SUBLEVEL(level)));
972 logfs_put_write_page(page);
973 if (!bofs)
974 return 0;
975
976 if (pure_ofs(bofs) == ofs)
977 return 1;
978 }
979 return 0;
980}
981
982static int logfs_is_valid_loop(struct inode *inode, u64 bix, u64 ofs)
983{
984 struct logfs_inode *li = logfs_inode(inode);
985 u64 bofs = li->li_data[INDIRECT_INDEX];
986
987 if (!bofs)
988 return 0;
989
990 if (bix >= maxbix(li->li_height))
991 return 0;
992
993 if (pure_ofs(bofs) == ofs)
994 return 1;
995
996 return __logfs_is_valid_loop(inode, bix, ofs, bofs);
997}
998
999static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs)
1000{
1001 struct logfs_inode *li = logfs_inode(inode);
1002
1003 if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1)
1004 return 0;
1005
1006 if (bix < I0_BLOCKS)
1007 return logfs_is_valid_direct(li, bix, ofs);
1008 return logfs_is_valid_loop(inode, bix, ofs);
1009}
1010
1011/**
1012 * logfs_is_valid_block - check whether this block is still valid
1013 *
1014 * @sb - superblock
1015 * @ofs - block physical offset
1016 * @ino - block inode number
1017 * @bix - block index
1018 * @level - block level
1019 *
1020 * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will
1021 * become invalid once the journal is written.
1022 */
1023int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
1024 gc_level_t gc_level)
1025{
1026 struct logfs_super *super = logfs_super(sb);
1027 struct inode *inode;
1028 int ret, cookie;
1029
1030 /* Umount closes a segment with free blocks remaining. Those
1031 * blocks are by definition invalid. */
1032 if (ino == -1)
1033 return 0;
1034
1035 LOGFS_BUG_ON((u64)(u_long)ino != ino, sb);
1036
1037 inode = logfs_safe_iget(sb, ino, &cookie);
1038 if (IS_ERR(inode))
1039 goto invalid;
1040
1041 ret = __logfs_is_valid_block(inode, bix, ofs);
1042 logfs_safe_iput(inode, cookie);
1043 if (ret)
1044 return ret;
1045
1046invalid:
1047 /* Block is nominally invalid, but may still sit in the shadow tree,
1048 * waiting for a journal commit.
1049 */
1050 if (btree_lookup64(&super->s_shadow_tree.old, ofs))
1051 return 2;
1052 return 0;
1053}
1054
1055int logfs_readpage_nolock(struct page *page)
1056{
1057 struct inode *inode = page->mapping->host;
1058 int ret = -EIO;
1059
1060 ret = logfs_read_block(inode, page, READ);
1061
1062 if (ret) {
1063 ClearPageUptodate(page);
1064 SetPageError(page);
1065 } else {
1066 SetPageUptodate(page);
1067 ClearPageError(page);
1068 }
1069 flush_dcache_page(page);
1070
1071 return ret;
1072}
1073
1074static int logfs_reserve_bytes(struct inode *inode, int bytes)
1075{
1076 struct logfs_super *super = logfs_super(inode->i_sb);
1077 u64 available = super->s_free_bytes + super->s_dirty_free_bytes
1078 - super->s_dirty_used_bytes - super->s_dirty_pages;
1079
1080 if (!bytes)
1081 return 0;
1082
1083 if (available < bytes)
1084 return -ENOSPC;
1085
1086 if (available < bytes + super->s_root_reserve &&
1087 !capable(CAP_SYS_RESOURCE))
1088 return -ENOSPC;
1089
1090 return 0;
1091}
1092
1093int get_page_reserve(struct inode *inode, struct page *page)
1094{
1095 struct logfs_super *super = logfs_super(inode->i_sb);
1096 int ret;
1097
1098 if (logfs_block(page) && logfs_block(page)->reserved_bytes)
1099 return 0;
1100
1101 logfs_get_wblocks(inode->i_sb, page, WF_LOCK);
1102 ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE);
1103 if (!ret) {
1104 alloc_data_block(inode, page);
1105 logfs_block(page)->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
1106 super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE;
1107 }
1108 logfs_put_wblocks(inode->i_sb, page, WF_LOCK);
1109 return ret;
1110}
1111
/*
 * We are protected by write lock. Push victims up to superblock level
 * and release transaction when appropriate.
 *
 * Called with the transaction (if any) that was attached to the block
 * being written.  Depending on ta->state, the victim inode number and/or
 * the rename directory and position are either recorded in the superblock
 * (the *_1 states, and the first half of TARGET_RENAME) or verified and
 * cleared again.  The states that end a transaction also kfree() it.
 *
 * Only transactions on the master inode are handled; anything else hits
 * BUG().
 */
/* FIXME: This is currently called from the wrong spots. */
static void logfs_handle_transaction(struct inode *inode,
		struct logfs_transaction *ta)
{
	struct logfs_super *super = logfs_super(inode->i_sb);

	if (!ta)
		return;
	/* Detach the transaction from the inode's block before acting on it */
	logfs_inode(inode)->li_block->ta = NULL;

	if (inode->i_ino != LOGFS_INO_MASTER) {
		BUG(); /* FIXME: Yes, this needs more thought */
		/* just remember the transaction until inode is written */
		//BUG_ON(logfs_inode(inode)->li_transaction);
		//logfs_inode(inode)->li_transaction = ta;
		return;
	}

	switch (ta->state) {
	case CREATE_1: /* fall through */
	case UNLINK_1:
		/* First step: remember the victim inode */
		BUG_ON(super->s_victim_ino);
		super->s_victim_ino = ta->ino;
		break;
	case CREATE_2: /* fall through */
	case UNLINK_2:
		BUG_ON(super->s_victim_ino != ta->ino);
		super->s_victim_ino = 0;
		/* transaction ends here - free it */
		kfree(ta);
		break;
	case CROSS_RENAME_1:
		/* First step: remember directory and position of the rename */
		BUG_ON(super->s_rename_dir);
		BUG_ON(super->s_rename_pos);
		super->s_rename_dir = ta->dir;
		super->s_rename_pos = ta->pos;
		break;
	case CROSS_RENAME_2:
		BUG_ON(super->s_rename_dir != ta->dir);
		BUG_ON(super->s_rename_pos != ta->pos);
		super->s_rename_dir = 0;
		super->s_rename_pos = 0;
		/* transaction ends here - free it */
		kfree(ta);
		break;
	case TARGET_RENAME_1:
		/* First step: remember rename location and victim inode */
		BUG_ON(super->s_rename_dir);
		BUG_ON(super->s_rename_pos);
		BUG_ON(super->s_victim_ino);
		super->s_rename_dir = ta->dir;
		super->s_rename_pos = ta->pos;
		super->s_victim_ino = ta->ino;
		break;
	case TARGET_RENAME_2:
		/*
		 * Second step: the rename location is cleared, the victim
		 * stays recorded and the transaction is not freed yet.
		 */
		BUG_ON(super->s_rename_dir != ta->dir);
		BUG_ON(super->s_rename_pos != ta->pos);
		BUG_ON(super->s_victim_ino != ta->ino);
		super->s_rename_dir = 0;
		super->s_rename_pos = 0;
		break;
	case TARGET_RENAME_3:
		BUG_ON(super->s_rename_dir);
		BUG_ON(super->s_rename_pos);
		BUG_ON(super->s_victim_ino != ta->ino);
		super->s_victim_ino = 0;
		/* transaction ends here - free it */
		kfree(ta);
		break;
	default:
		BUG();
	}
}
1186
1187/*
1188 * Not strictly a reservation, but rather a check that we still have enough
1189 * space to satisfy the write.
1190 */
1191static int logfs_reserve_blocks(struct inode *inode, int blocks)
1192{
1193 return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE);
1194}
1195
/*
 * Per-level state handed down and back up while writing through the tree.
 */
struct write_control {
	/*
	 * Pointer to the object being written: the current pointer on the
	 * way in (0 if there is none), the new pointer on the way out.
	 */
	u64 ofs;
	/* WF_* flags (WF_WRITE, WF_DELETE, ...) selecting what to do */
	long flags;
};
1200
1201static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix,
1202 level_t level, u64 old_ofs)
1203{
1204 struct logfs_super *super = logfs_super(inode->i_sb);
1205 struct logfs_shadow *shadow;
1206
1207 shadow = mempool_alloc(super->s_shadow_pool, GFP_NOFS);
1208 memset(shadow, 0, sizeof(*shadow));
1209 shadow->ino = inode->i_ino;
1210 shadow->bix = bix;
1211 shadow->gc_level = expand_level(inode->i_ino, level);
1212 shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED;
1213 return shadow;
1214}
1215
1216static void free_shadow(struct inode *inode, struct logfs_shadow *shadow)
1217{
1218 struct logfs_super *super = logfs_super(inode->i_sb);
1219
1220 mempool_free(shadow, super->s_shadow_pool);
1221}
1222
1223static void mark_segment(struct shadow_tree *tree, u32 segno)
1224{
1225 int err;
1226
1227 if (!btree_lookup32(&tree->segment_map, segno)) {
1228 err = btree_insert32(&tree->segment_map, segno, (void *)1,
1229 GFP_NOFS);
1230 BUG_ON(err);
1231 tree->no_shadowed_segments++;
1232 }
1233}
1234
1235/**
1236 * fill_shadow_tree - Propagate shadow tree changes due to a write
1237 * @inode: Inode owning the page
1238 * @page: Struct page that was written
1239 * @shadow: Shadow for the current write
1240 *
1241 * Writes in logfs can result in two semi-valid objects. The old object
1242 * is still valid as long as it can be reached by following pointers on
1243 * the medium. Only when writes propagate all the way up to the journal
1244 * has the new object safely replaced the old one.
1245 *
1246 * To handle this problem, a struct logfs_shadow is used to represent
1247 * every single write. It is attached to the indirect block, which is
1248 * marked dirty. When the indirect block is written, its shadows are
1249 * handed up to the next indirect block (or inode). Untimately they
1250 * will reach the master inode and be freed upon journal commit.
1251 *
1252 * This function handles a single step in the propagation. It adds the
1253 * shadow for the current write to the tree, along with any shadows in
1254 * the page's tree, in case it was an indirect block. If a page is
1255 * written, the inode parameter is left NULL, if an inode is written,
1256 * the page parameter is left NULL.
1257 */
1258static void fill_shadow_tree(struct inode *inode, struct page *page,
1259 struct logfs_shadow *shadow)
1260{
1261 struct logfs_super *super = logfs_super(inode->i_sb);
1262 struct logfs_block *block = logfs_block(page);
1263 struct shadow_tree *tree = &super->s_shadow_tree;
1264
1265 if (PagePrivate(page)) {
1266 if (block->alias_map)
1267 super->s_no_object_aliases -= bitmap_weight(
1268 block->alias_map, LOGFS_BLOCK_FACTOR);
1269 logfs_handle_transaction(inode, block->ta);
1270 block->ops->free_block(inode->i_sb, block);
1271 }
1272 if (shadow) {
1273 if (shadow->old_ofs)
1274 btree_insert64(&tree->old, shadow->old_ofs, shadow,
1275 GFP_NOFS);
1276 else
1277 btree_insert64(&tree->new, shadow->new_ofs, shadow,
1278 GFP_NOFS);
1279
1280 super->s_dirty_used_bytes += shadow->new_len;
1281 super->s_dirty_free_bytes += shadow->old_len;
1282 mark_segment(tree, shadow->old_ofs >> super->s_segshift);
1283 mark_segment(tree, shadow->new_ofs >> super->s_segshift);
1284 }
1285}
1286
1287static void logfs_set_alias(struct super_block *sb, struct logfs_block *block,
1288 long child_no)
1289{
1290 struct logfs_super *super = logfs_super(sb);
1291
1292 if (block->inode && block->inode->i_ino == LOGFS_INO_MASTER) {
1293 /* Aliases in the master inode are pointless. */
1294 return;
1295 }
1296
1297 if (!test_bit(child_no, block->alias_map)) {
1298 set_bit(child_no, block->alias_map);
1299 super->s_no_object_aliases++;
1300 }
1301 list_move_tail(&block->alias_list, &super->s_object_alias);
1302}
1303
1304/*
1305 * Object aliases can and often do change the size and occupied space of a
1306 * file. So not only do we have to change the pointers, we also have to
1307 * change inode->i_size and li->li_used_bytes. Which is done by setting
1308 * another two object aliases for the inode itself.
1309 */
1310static void set_iused(struct inode *inode, struct logfs_shadow *shadow)
1311{
1312 struct logfs_inode *li = logfs_inode(inode);
1313
1314 if (shadow->new_len == shadow->old_len)
1315 return;
1316
1317 alloc_inode_block(inode);
1318 li->li_used_bytes += shadow->new_len - shadow->old_len;
1319 __logfs_set_blocks(inode);
1320 logfs_set_alias(inode->i_sb, li->li_block, INODE_USED_OFS);
1321 logfs_set_alias(inode->i_sb, li->li_block, INODE_SIZE_OFS);
1322}
1323
/*
 * Write and/or delete a single object (the content of @page) and record
 * the change in the shadow tree.
 *
 * On entry wc->ofs is the object's current pointer (0 if there is none).
 * On success it is replaced by the new pointer, with
 * LOGFS_FULLY_POPULATED or'ed in when the new pointer is non-zero and the
 * object counts as full.  wc->flags selects the WF_WRITE and WF_DELETE
 * steps.
 *
 * Returns 0 on success, -ENOSPC if the space check for an object without
 * an old copy fails, or the error from logfs_segment_write().
 */
static int logfs_write_i0(struct inode *inode, struct page *page,
		struct write_control *wc)
{
	struct logfs_shadow *shadow;
	u64 bix;
	level_t level;
	int full, err = 0;

	logfs_unpack_index(page->index, &bix, &level);
	/* The space check is only done for objects without an old copy */
	if (wc->ofs == 0)
		if (logfs_reserve_blocks(inode, 1))
			return -ENOSPC;

	shadow = alloc_shadow(inode, bix, level, wc->ofs);
	if (wc->flags & WF_WRITE)
		err = logfs_segment_write(inode, page, shadow);
	/* NOTE(review): this runs even if the write above failed - confirm intended */
	if (wc->flags & WF_DELETE)
		logfs_segment_delete(inode, shadow);
	if (err) {
		free_shadow(inode, shadow);
		return err;
	}

	set_iused(inode, shadow);
	/*
	 * Level 0 objects always count as full, indirect blocks only if
	 * their full counter equals LOGFS_BLOCK_FACTOR.
	 */
	full = 1;
	if (level != 0) {
		alloc_indirect_block(inode, page, 0);
		full = logfs_block(page)->full == LOGFS_BLOCK_FACTOR;
	}
	/* May free the block attached to page, so full was sampled first */
	fill_shadow_tree(inode, page, shadow);
	wc->ofs = shadow->new_ofs;
	if (wc->ofs && full)
		wc->ofs |= LOGFS_FULLY_POPULATED;
	return 0;
}
1359
1360static int logfs_write_direct(struct inode *inode, struct page *page,
1361 long flags)
1362{
1363 struct logfs_inode *li = logfs_inode(inode);
1364 struct write_control wc = {
1365 .ofs = li->li_data[page->index],
1366 .flags = flags,
1367 };
1368 int err;
1369
1370 alloc_inode_block(inode);
1371
1372 err = logfs_write_i0(inode, page, &wc);
1373 if (err)
1374 return err;
1375
1376 li->li_data[page->index] = wc.ofs;
1377 logfs_set_alias(inode->i_sb, li->li_block,
1378 page->index + INODE_POINTER_OFS);
1379 return 0;
1380}
1381
1382static int ptr_change(u64 ofs, struct page *page)
1383{
1384 struct logfs_block *block = logfs_block(page);
1385 int empty0, empty1, full0, full1;
1386
1387 empty0 = ofs == 0;
1388 empty1 = block->partial == 0;
1389 if (empty0 != empty1)
1390 return 1;
1391
1392 /* The !! is necessary to shrink result to int */
1393 full0 = !!(ofs & LOGFS_FULLY_POPULATED);
1394 full1 = block->full == LOGFS_BLOCK_FACTOR;
1395 if (full0 != full1)
1396 return 1;
1397 return 0;
1398}
1399
/*
 * Recursive step of a write that passes through indirect blocks.
 *
 * Gets the indirect page at @level covering @bix, reads it from
 * this_wc->ofs (or zero-fills it if there is no pointer and the page is
 * not uptodate), hands the write of @page down to the next lower level and
 * stores the resulting child pointer in the indirect page.  Afterwards the
 * indirect block is either rewritten itself or an alias for the changed
 * slot is recorded.
 *
 * this_wc->ofs is the pointer to the indirect block at @level and gets
 * updated if that block is rewritten.
 *
 * Returns 0 on success, -ENOMEM if the indirect page cannot be obtained,
 * or the error from reading or from the lower levels.
 */
static int __logfs_write_rec(struct inode *inode, struct page *page,
		struct write_control *this_wc,
		pgoff_t bix, level_t target_level, level_t level)
{
	int ret, page_empty = 0;
	int child_no = get_bits(bix, SUBLEVEL(level));
	struct page *ipage;
	struct write_control child_wc = {
		.flags = this_wc->flags,
	};

	ipage = logfs_get_write_page(inode, bix, level);
	if (!ipage)
		return -ENOMEM;

	if (this_wc->ofs) {
		ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
		if (ret)
			goto out;
	} else if (!PageUptodate(ipage)) {
		page_empty = 1;
		logfs_read_empty(ipage);
	}

	/* Current pointer to the child we are about to write */
	child_wc.ofs = block_get_pointer(ipage, child_no);

	/* Recurse while the child is still above the target level */
	if ((__force u8)level-1 > (__force u8)target_level)
		ret = __logfs_write_rec(inode, page, &child_wc, bix,
				target_level, SUBLEVEL(level));
	else
		ret = logfs_write_i0(inode, page, &child_wc);

	if (ret)
		goto out;

	alloc_indirect_block(inode, ipage, page_empty);
	block_set_pointer(ipage, child_no, child_wc.ofs);
	/* FIXME: first condition seems superfluous */
	if (child_wc.ofs || logfs_block(ipage)->partial)
		this_wc->flags |= WF_WRITE;
	/* the condition on this_wc->ofs ensures that we won't consume extra
	 * space for indirect blocks in the future, which we cannot reserve */
	if (!this_wc->ofs || ptr_change(this_wc->ofs, ipage))
		ret = logfs_write_i0(inode, ipage, this_wc);
	else
		logfs_set_alias(inode->i_sb, logfs_block(ipage), child_no);
out:
	logfs_put_write_page(ipage);
	return ret;
}
1450
1451static int logfs_write_rec(struct inode *inode, struct page *page,
1452 pgoff_t bix, level_t target_level, long flags)
1453{
1454 struct logfs_inode *li = logfs_inode(inode);
1455 struct write_control wc = {
1456 .ofs = li->li_data[INDIRECT_INDEX],
1457 .flags = flags,
1458 };
1459 int ret;
1460
1461 alloc_inode_block(inode);
1462
1463 if (li->li_height > (__force u8)target_level)
1464 ret = __logfs_write_rec(inode, page, &wc, bix, target_level,
1465 LEVEL(li->li_height));
1466 else
1467 ret = logfs_write_i0(inode, page, &wc);
1468 if (ret)
1469 return ret;
1470
1471 if (li->li_data[INDIRECT_INDEX] != wc.ofs) {
1472 li->li_data[INDIRECT_INDEX] = wc.ofs;
1473 logfs_set_alias(inode->i_sb, li->li_block,
1474 INDIRECT_INDEX + INODE_POINTER_OFS);
1475 }
1476 return ret;
1477}
1478
1479void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta)
1480{
1481 alloc_inode_block(inode);
1482 logfs_inode(inode)->li_block->ta = ta;
1483}
1484
1485void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta)
1486{
1487 struct logfs_block *block = logfs_inode(inode)->li_block;
1488
1489 if (block && block->ta)
1490 block->ta = NULL;
1491}
1492
/*
 * Increase the height of the inode's indirect tree until it is at least
 * @level high and maxbix() of the height exceeds @bix.
 *
 * Each round writes a new, otherwise empty indirect block one level above
 * the current root whose slot 0 holds the old root pointer, makes it the
 * new root, increments li_height and records an alias for the height.
 *
 * Returns 0 on success, -ENOMEM if no page can be obtained, or the error
 * from logfs_write_i0().
 */
static int grow_inode(struct inode *inode, u64 bix, level_t level)
{
	struct logfs_inode *li = logfs_inode(inode);
	u8 height = (__force u8)level;
	struct page *page;
	struct write_control wc = {
		.flags = WF_WRITE,
	};
	int err;

	BUG_ON(height > 5 || li->li_height > 5);
	while (height > li->li_height || bix >= maxbix(li->li_height)) {
		page = logfs_get_write_page(inode, I0_BLOCKS + 1,
				LEVEL(li->li_height + 1));
		if (!page)
			return -ENOMEM;
		logfs_read_empty(page);
		alloc_indirect_block(inode, page, 1);
		/* The old root becomes the first child of the new root */
		block_set_pointer(page, 0, li->li_data[INDIRECT_INDEX]);
		err = logfs_write_i0(inode, page, &wc);
		logfs_put_write_page(page);
		if (err)
			return err;
		li->li_data[INDIRECT_INDEX] = wc.ofs;
		/* The next round again writes a block without an old copy */
		wc.ofs = 0;
		li->li_height++;
		logfs_set_alias(inode->i_sb, li->li_block, INODE_HEIGHT_OFS);
	}
	return 0;
}
1523
1524static int __logfs_write_buf(struct inode *inode, struct page *page, long flags)
1525{
1526 struct logfs_super *super = logfs_super(inode->i_sb);
1527 pgoff_t index = page->index;
1528 u64 bix;
1529 level_t level;
1530 int err;
1531
1532 flags |= WF_WRITE | WF_DELETE;
1533 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1534
1535 logfs_unpack_index(index, &bix, &level);
1536 if (logfs_block(page) && logfs_block(page)->reserved_bytes)
1537 super->s_dirty_pages -= logfs_block(page)->reserved_bytes;
1538
1539 if (index < I0_BLOCKS)
1540 return logfs_write_direct(inode, page, flags);
1541
1542 bix = adjust_bix(bix, level);
1543 err = grow_inode(inode, bix, level);
1544 if (err)
1545 return err;
1546 return logfs_write_rec(inode, page, bix, level, flags);
1547}
1548
1549int logfs_write_buf(struct inode *inode, struct page *page, long flags)
1550{
1551 struct super_block *sb = inode->i_sb;
1552 int ret;
1553
1554 logfs_get_wblocks(sb, page, flags & WF_LOCK);
1555 ret = __logfs_write_buf(inode, page, flags);
1556 logfs_put_wblocks(sb, page, flags & WF_LOCK);
1557 return ret;
1558}
1559
1560static int __logfs_delete(struct inode *inode, struct page *page)
1561{
1562 long flags = WF_DELETE;
1563
1564 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1565
1566 if (page->index < I0_BLOCKS)
1567 return logfs_write_direct(inode, page, flags);
1568 return logfs_write_rec(inode, page, page->index, 0, flags);
1569}
1570
1571int logfs_delete(struct inode *inode, pgoff_t index,
1572 struct shadow_tree *shadow_tree)
1573{
1574 struct super_block *sb = inode->i_sb;
1575 struct page *page;
1576 int ret;
1577
1578 page = logfs_get_read_page(inode, index, 0);
1579 if (!page)
1580 return -ENOMEM;
1581
1582 logfs_get_wblocks(sb, page, 1);
1583 ret = __logfs_delete(inode, page);
1584 logfs_put_wblocks(sb, page, 1);
1585
1586 logfs_put_read_page(page);
1587
1588 return ret;
1589}
1590
1591int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
1592 gc_level_t gc_level, long flags)
1593{
1594 level_t level = shrink_level(gc_level);
1595 struct page *page;
1596 int err;
1597
1598 page = logfs_get_write_page(inode, bix, level);
1599 if (!page)
1600 return -ENOMEM;
1601
1602 err = logfs_segment_read(inode, page, ofs, bix, level);
1603 if (!err) {
1604 if (level != 0)
1605 alloc_indirect_block(inode, page, 0);
1606 err = logfs_write_buf(inode, page, flags);
1607 if (!err && shrink_level(gc_level) == 0) {
1608 /* Rewrite cannot mark the inode dirty but has to
1609 * write it immediatly.
1610 * Q: Can't we just create an alias for the inode
1611 * instead? And if not, why not?
1612 */
1613 if (inode->i_ino == LOGFS_INO_MASTER)
1614 logfs_write_anchor(inode->i_sb);
1615 else {
1616 err = __logfs_write_inode(inode, flags);
1617 }
1618 }
1619 }
1620 logfs_put_write_page(page);
1621 return err;
1622}
1623
1624static int truncate_data_block(struct inode *inode, struct page *page,
1625 u64 ofs, struct logfs_shadow *shadow, u64 size)
1626{
1627 loff_t pageofs = page->index << inode->i_sb->s_blocksize_bits;
1628 u64 bix;
1629 level_t level;
1630 int err;
1631
1632 /* Does truncation happen within this page? */
1633 if (size <= pageofs || size - pageofs >= PAGE_SIZE)
1634 return 0;
1635
1636 logfs_unpack_index(page->index, &bix, &level);
1637 BUG_ON(level != 0);
1638
1639 err = logfs_segment_read(inode, page, ofs, bix, level);
1640 if (err)
1641 return err;
1642
1643 zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE);
1644 return logfs_segment_write(inode, page, shadow);
1645}
1646
1647static int logfs_truncate_i0(struct inode *inode, struct page *page,
1648 struct write_control *wc, u64 size)
1649{
1650 struct logfs_shadow *shadow;
1651 u64 bix;
1652 level_t level;
1653 int err = 0;
1654
1655 logfs_unpack_index(page->index, &bix, &level);
1656 BUG_ON(level != 0);
1657 shadow = alloc_shadow(inode, bix, level, wc->ofs);
1658
1659 err = truncate_data_block(inode, page, wc->ofs, shadow, size);
1660 if (err) {
1661 free_shadow(inode, shadow);
1662 return err;
1663 }
1664
1665 logfs_segment_delete(inode, shadow);
1666 set_iused(inode, shadow);
1667 fill_shadow_tree(inode, page, shadow);
1668 wc->ofs = shadow->new_ofs;
1669 return 0;
1670}
1671
1672static int logfs_truncate_direct(struct inode *inode, u64 size)
1673{
1674 struct logfs_inode *li = logfs_inode(inode);
1675 struct write_control wc;
1676 struct page *page;
1677 int e;
1678 int err;
1679
1680 alloc_inode_block(inode);
1681
1682 for (e = I0_BLOCKS - 1; e >= 0; e--) {
1683 if (size > (e+1) * LOGFS_BLOCKSIZE)
1684 break;
1685
1686 wc.ofs = li->li_data[e];
1687 if (!wc.ofs)
1688 continue;
1689
1690 page = logfs_get_write_page(inode, e, 0);
1691 if (!page)
1692 return -ENOMEM;
1693 err = logfs_segment_read(inode, page, wc.ofs, e, 0);
1694 if (err) {
1695 logfs_put_write_page(page);
1696 return err;
1697 }
1698 err = logfs_truncate_i0(inode, page, &wc, size);
1699 logfs_put_write_page(page);
1700 if (err)
1701 return err;
1702
1703 li->li_data[e] = wc.ofs;
1704 }
1705 return 0;
1706}
1707
1708/* FIXME: these need to become per-sb once we support different blocksizes */
1709static u64 __logfs_step[] = {
1710 1,
1711 I1_BLOCKS,
1712 I2_BLOCKS,
1713 I3_BLOCKS,
1714};
1715
1716static u64 __logfs_start_index[] = {
1717 I0_BLOCKS,
1718 I1_BLOCKS,
1719 I2_BLOCKS,
1720 I3_BLOCKS
1721};
1722
1723static inline u64 logfs_step(level_t level)
1724{
1725 return __logfs_step[(__force u8)level];
1726}
1727
1728static inline u64 logfs_factor(u8 level)
1729{
1730 return __logfs_step[level] * LOGFS_BLOCKSIZE;
1731}
1732
1733static inline u64 logfs_start_index(level_t level)
1734{
1735 return __logfs_start_index[(__force u8)level];
1736}
1737
1738static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, level_t *level)
1739{
1740 logfs_unpack_index(index, bix, level);
1741 if (*bix <= logfs_start_index(SUBLEVEL(*level)))
1742 *bix = 0;
1743}
1744
1745static int __logfs_truncate_rec(struct inode *inode, struct page *ipage,
1746 struct write_control *this_wc, u64 size)
1747{
1748 int truncate_happened = 0;
1749 int e, err = 0;
1750 u64 bix, child_bix, next_bix;
1751 level_t level;
1752 struct page *page;
1753 struct write_control child_wc = { /* FIXME: flags */ };
1754
1755 logfs_unpack_raw_index(ipage->index, &bix, &level);
1756 err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
1757 if (err)
1758 return err;
1759
1760 for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) {
1761 child_bix = bix + e * logfs_step(SUBLEVEL(level));
1762 next_bix = child_bix + logfs_step(SUBLEVEL(level));
1763 if (size > next_bix * LOGFS_BLOCKSIZE)
1764 break;
1765
1766 child_wc.ofs = pure_ofs(block_get_pointer(ipage, e));
1767 if (!child_wc.ofs)
1768 continue;
1769
1770 page = logfs_get_write_page(inode, child_bix, SUBLEVEL(level));
1771 if (!page)
1772 return -ENOMEM;
1773
1774 if ((__force u8)level > 1)
1775 err = __logfs_truncate_rec(inode, page, &child_wc, size);
1776 else
1777 err = logfs_truncate_i0(inode, page, &child_wc, size);
1778 logfs_put_write_page(page);
1779 if (err)
1780 return err;
1781
1782 truncate_happened = 1;
1783 alloc_indirect_block(inode, ipage, 0);
1784 block_set_pointer(ipage, e, child_wc.ofs);
1785 }
1786
1787 if (!truncate_happened) {
1788 printk("ineffectual truncate (%lx, %lx, %llx)\n", inode->i_ino, ipage->index, size);
1789 return 0;
1790 }
1791
1792 this_wc->flags = WF_DELETE;
1793 if (logfs_block(ipage)->partial)
1794 this_wc->flags |= WF_WRITE;
1795
1796 return logfs_write_i0(inode, ipage, this_wc);
1797}
1798
1799static int logfs_truncate_rec(struct inode *inode, u64 size)
1800{
1801 struct logfs_inode *li = logfs_inode(inode);
1802 struct write_control wc = {
1803 .ofs = li->li_data[INDIRECT_INDEX],
1804 };
1805 struct page *page;
1806 int err;
1807
1808 alloc_inode_block(inode);
1809
1810 if (!wc.ofs)
1811 return 0;
1812
1813 page = logfs_get_write_page(inode, 0, LEVEL(li->li_height));
1814 if (!page)
1815 return -ENOMEM;
1816
1817 err = __logfs_truncate_rec(inode, page, &wc, size);
1818 logfs_put_write_page(page);
1819 if (err)
1820 return err;
1821
1822 if (li->li_data[INDIRECT_INDEX] != wc.ofs)
1823 li->li_data[INDIRECT_INDEX] = wc.ofs;
1824 return 0;
1825}
1826
1827static int __logfs_truncate(struct inode *inode, u64 size)
1828{
1829 int ret;
1830
1831 if (size >= logfs_factor(logfs_inode(inode)->li_height))
1832 return 0;
1833
1834 ret = logfs_truncate_rec(inode, size);
1835 if (ret)
1836 return ret;
1837
1838 return logfs_truncate_direct(inode, size);
1839}
1840
1841/*
1842 * Truncate, by changing the segment file, can consume a fair amount
1843 * of resources. So back off from time to time and do some GC.
1844 * 8 or 2048 blocks should be well within safety limits even if
1845 * every single block resided in a different segment.
1846 */
1847#define TRUNCATE_STEP (8 * 1024 * 1024)
1848int logfs_truncate(struct inode *inode, u64 target)
1849{
1850 struct super_block *sb = inode->i_sb;
1851 u64 size = i_size_read(inode);
1852 int err = 0;
1853
1854 size = ALIGN(size, TRUNCATE_STEP);
1855 while (size > target) {
1856 if (size > TRUNCATE_STEP)
1857 size -= TRUNCATE_STEP;
1858 else
1859 size = 0;
1860 if (size < target)
1861 size = target;
1862
1863 logfs_get_wblocks(sb, NULL, 1);
1864 err = __logfs_truncate(inode, target);
1865 if (!err)
1866 err = __logfs_write_inode(inode, 0);
1867 logfs_put_wblocks(sb, NULL, 1);
1868 }
1869
1870 if (!err)
1871 err = vmtruncate(inode, target);
1872
1873 /* I don't trust error recovery yet. */
1874 WARN_ON(err);
1875 return err;
1876}
1877
1878static void move_page_to_inode(struct inode *inode, struct page *page)
1879{
1880 struct logfs_inode *li = logfs_inode(inode);
1881 struct logfs_block *block = logfs_block(page);
1882
1883 if (!block)
1884 return;
1885
1886 log_blockmove("move_page_to_inode(%llx, %llx, %x)\n",
1887 block->ino, block->bix, block->level);
1888 BUG_ON(li->li_block);
1889 block->ops = &inode_block_ops;
1890 block->inode = inode;
1891 li->li_block = block;
1892
1893 block->page = NULL;
1894 page->private = 0;
1895 ClearPagePrivate(page);
1896}
1897
1898static void move_inode_to_page(struct page *page, struct inode *inode)
1899{
1900 struct logfs_inode *li = logfs_inode(inode);
1901 struct logfs_block *block = li->li_block;
1902
1903 if (!block)
1904 return;
1905
1906 log_blockmove("move_inode_to_page(%llx, %llx, %x)\n",
1907 block->ino, block->bix, block->level);
1908 BUG_ON(PagePrivate(page));
1909 block->ops = &indirect_block_ops;
1910 block->page = page;
1911 page->private = (unsigned long)block;
1912 SetPagePrivate(page);
1913
1914 block->inode = NULL;
1915 li->li_block = NULL;
1916}
1917
1918int logfs_read_inode(struct inode *inode)
1919{
1920 struct super_block *sb = inode->i_sb;
1921 struct logfs_super *super = logfs_super(sb);
1922 struct inode *master_inode = super->s_master_inode;
1923 struct page *page;
1924 struct logfs_disk_inode *di;
1925 u64 ino = inode->i_ino;
1926
1927 if (ino << sb->s_blocksize_bits > i_size_read(master_inode))
1928 return -ENODATA;
1929 if (!logfs_exist_block(master_inode, ino))
1930 return -ENODATA;
1931
1932 page = read_cache_page(master_inode->i_mapping, ino,
1933 (filler_t *)logfs_readpage, NULL);
1934 if (IS_ERR(page))
1935 return PTR_ERR(page);
1936
1937 di = kmap_atomic(page, KM_USER0);
1938 logfs_disk_to_inode(di, inode);
1939 kunmap_atomic(di, KM_USER0);
1940 move_page_to_inode(inode, page);
1941 page_cache_release(page);
1942 return 0;
1943}
1944
1945/* Caller must logfs_put_write_page(page); */
1946static struct page *inode_to_page(struct inode *inode)
1947{
1948 struct inode *master_inode = logfs_super(inode->i_sb)->s_master_inode;
1949 struct logfs_disk_inode *di;
1950 struct page *page;
1951
1952 BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
1953
1954 page = logfs_get_write_page(master_inode, inode->i_ino, 0);
1955 if (!page)
1956 return NULL;
1957
1958 di = kmap_atomic(page, KM_USER0);
1959 logfs_inode_to_disk(inode, di);
1960 kunmap_atomic(di, KM_USER0);
1961 move_inode_to_page(page, inode);
1962 return page;
1963}
1964
1965/* Cheaper version of write_inode. All changes are concealed in
1966 * aliases, which are moved back. No write to the medium happens.
1967 */
1968void logfs_clear_inode(struct inode *inode)
1969{
1970 struct super_block *sb = inode->i_sb;
1971 struct logfs_inode *li = logfs_inode(inode);
1972 struct logfs_block *block = li->li_block;
1973 struct page *page;
1974
1975 /* Only deleted files may be dirty at this point */
1976 BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink);
1977 if (!block)
1978 return;
1979 if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) {
1980 block->ops->free_block(inode->i_sb, block);
1981 return;
1982 }
1983
1984 BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS);
1985 page = inode_to_page(inode);
1986 BUG_ON(!page); /* FIXME: Use emergency page */
1987 logfs_put_write_page(page);
1988}
1989
1990static int do_write_inode(struct inode *inode)
1991{
1992 struct super_block *sb = inode->i_sb;
1993 struct inode *master_inode = logfs_super(sb)->s_master_inode;
1994 loff_t size = (inode->i_ino + 1) << inode->i_sb->s_blocksize_bits;
1995 struct page *page;
1996 int err;
1997
1998 BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
1999 /* FIXME: lock inode */
2000
2001 if (i_size_read(master_inode) < size)
2002 i_size_write(master_inode, size);
2003
2004 /* TODO: Tell vfs this inode is clean now */
2005
2006 page = inode_to_page(inode);
2007 if (!page)
2008 return -ENOMEM;
2009
2010 /* FIXME: transaction is part of logfs_block now. Is that enough? */
2011 err = logfs_write_buf(master_inode, page, 0);
2012 logfs_put_write_page(page);
2013 return err;
2014}
2015
2016static void logfs_mod_segment_entry(struct super_block *sb, u32 segno,
2017 int write,
2018 void (*change_se)(struct logfs_segment_entry *, long),
2019 long arg)
2020{
2021 struct logfs_super *super = logfs_super(sb);
2022 struct inode *inode;
2023 struct page *page;
2024 struct logfs_segment_entry *se;
2025 pgoff_t page_no;
2026 int child_no;
2027
2028 page_no = segno >> (sb->s_blocksize_bits - 3);
2029 child_no = segno & ((sb->s_blocksize >> 3) - 1);
2030
2031 inode = super->s_segfile_inode;
2032 page = logfs_get_write_page(inode, page_no, 0);
2033 BUG_ON(!page); /* FIXME: We need some reserve page for this case */
2034 if (!PageUptodate(page))
2035 logfs_read_block(inode, page, WRITE);
2036
2037 if (write)
2038 alloc_indirect_block(inode, page, 0);
2039 se = kmap_atomic(page, KM_USER0);
2040 change_se(se + child_no, arg);
2041 if (write) {
2042 logfs_set_alias(sb, logfs_block(page), child_no);
2043 BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize);
2044 }
2045 kunmap_atomic(se, KM_USER0);
2046
2047 logfs_put_write_page(page);
2048}
2049
2050static void __get_segment_entry(struct logfs_segment_entry *se, long _target)
2051{
2052 struct logfs_segment_entry *target = (void *)_target;
2053
2054 *target = *se;
2055}
2056
2057void logfs_get_segment_entry(struct super_block *sb, u32 segno,
2058 struct logfs_segment_entry *se)
2059{
2060 logfs_mod_segment_entry(sb, segno, 0, __get_segment_entry, (long)se);
2061}
2062
2063static void __set_segment_used(struct logfs_segment_entry *se, long increment)
2064{
2065 u32 valid;
2066
2067 valid = be32_to_cpu(se->valid);
2068 valid += increment;
2069 se->valid = cpu_to_be32(valid);
2070}
2071
2072void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment)
2073{
2074 struct logfs_super *super = logfs_super(sb);
2075 u32 segno = ofs >> super->s_segshift;
2076
2077 if (!increment)
2078 return;
2079
2080 logfs_mod_segment_entry(sb, segno, 1, __set_segment_used, increment);
2081}
2082
2083static void __set_segment_erased(struct logfs_segment_entry *se, long ec_level)
2084{
2085 se->ec_level = cpu_to_be32(ec_level);
2086}
2087
2088void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
2089 gc_level_t gc_level)
2090{
2091 u32 ec_level = ec << 4 | (__force u8)gc_level;
2092
2093 logfs_mod_segment_entry(sb, segno, 1, __set_segment_erased, ec_level);
2094}
2095
2096static void __set_segment_reserved(struct logfs_segment_entry *se, long ignore)
2097{
2098 se->valid = cpu_to_be32(RESERVED);
2099}
2100
2101void logfs_set_segment_reserved(struct super_block *sb, u32 segno)
2102{
2103 logfs_mod_segment_entry(sb, segno, 1, __set_segment_reserved, 0);
2104}
2105
2106static void __set_segment_unreserved(struct logfs_segment_entry *se,
2107 long ec_level)
2108{
2109 se->valid = 0;
2110 se->ec_level = cpu_to_be32(ec_level);
2111}
2112
2113void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec)
2114{
2115 u32 ec_level = ec << 4;
2116
2117 logfs_mod_segment_entry(sb, segno, 1, __set_segment_unreserved,
2118 ec_level);
2119}
2120
2121int __logfs_write_inode(struct inode *inode, long flags)
2122{
2123 struct super_block *sb = inode->i_sb;
2124 int ret;
2125
2126 logfs_get_wblocks(sb, NULL, flags & WF_LOCK);
2127 ret = do_write_inode(inode);
2128 logfs_put_wblocks(sb, NULL, flags & WF_LOCK);
2129 return ret;
2130}
2131
2132static int do_delete_inode(struct inode *inode)
2133{
2134 struct super_block *sb = inode->i_sb;
2135 struct inode *master_inode = logfs_super(sb)->s_master_inode;
2136 struct page *page;
2137 int ret;
2138
2139 page = logfs_get_write_page(master_inode, inode->i_ino, 0);
2140 if (!page)
2141 return -ENOMEM;
2142
2143 move_inode_to_page(page, inode);
2144
2145 logfs_get_wblocks(sb, page, 1);
2146 ret = __logfs_delete(master_inode, page);
2147 logfs_put_wblocks(sb, page, 1);
2148
2149 logfs_put_write_page(page);
2150 return ret;
2151}
2152
2153/*
2154 * ZOMBIE inodes have already been deleted before and should remain dead,
2155 * if it weren't for valid checking. No need to kill them again here.
2156 */
2157void logfs_delete_inode(struct inode *inode)
2158{
2159 struct logfs_inode *li = logfs_inode(inode);
2160
2161 if (!(li->li_flags & LOGFS_IF_ZOMBIE)) {
2162 li->li_flags |= LOGFS_IF_ZOMBIE;
2163 if (i_size_read(inode) > 0)
2164 logfs_truncate(inode, 0);
2165 do_delete_inode(inode);
2166 }
2167 truncate_inode_pages(&inode->i_data, 0);
2168 clear_inode(inode);
2169}
2170
2171void btree_write_block(struct logfs_block *block)
2172{
2173 struct inode *inode;
2174 struct page *page;
2175 int err, cookie;
2176
2177 inode = logfs_safe_iget(block->sb, block->ino, &cookie);
2178 page = logfs_get_write_page(inode, block->bix, block->level);
2179
2180 err = logfs_readpage_nolock(page);
2181 BUG_ON(err);
2182 BUG_ON(!PagePrivate(page));
2183 BUG_ON(logfs_block(page) != block);
2184 err = __logfs_write_buf(inode, page, 0);
2185 BUG_ON(err);
2186 BUG_ON(PagePrivate(page) || page->private);
2187
2188 logfs_put_write_page(page);
2189 logfs_safe_iput(inode, cookie);
2190}
2191
2192/**
2193 * logfs_inode_write - write inode or dentry objects
2194 *
2195 * @inode: parent inode (ifile or directory)
2196 * @buf: object to write (inode or dentry)
2197 * @n: object size
2198 * @_pos: object number (file position in blocks/objects)
2199 * @flags: write flags
2200 * @lock: 0 if write lock is already taken, 1 otherwise
2201 * @shadow_tree: shadow below this inode
2202 *
2203 * FIXME: All caller of this put a 200-300 byte variable on the stack,
2204 * only to call here and do a memcpy from that stack variable. A good
2205 * example of wasted performance and stack space.
2206 */
2207int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
2208 loff_t bix, long flags, struct shadow_tree *shadow_tree)
2209{
2210 loff_t pos = bix << inode->i_sb->s_blocksize_bits;
2211 int err;
2212 struct page *page;
2213 void *pagebuf;
2214
2215 BUG_ON(pos & (LOGFS_BLOCKSIZE-1));
2216 BUG_ON(count > LOGFS_BLOCKSIZE);
2217 page = logfs_get_write_page(inode, bix, 0);
2218 if (!page)
2219 return -ENOMEM;
2220
2221 pagebuf = kmap_atomic(page, KM_USER0);
2222 memcpy(pagebuf, buf, count);
2223 flush_dcache_page(page);
2224 kunmap_atomic(pagebuf, KM_USER0);
2225
2226 if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE)
2227 i_size_write(inode, pos + LOGFS_BLOCKSIZE);
2228
2229 err = logfs_write_buf(inode, page, flags);
2230 logfs_put_write_page(page);
2231 return err;
2232}
2233
2234int logfs_open_segfile(struct super_block *sb)
2235{
2236 struct logfs_super *super = logfs_super(sb);
2237 struct inode *inode;
2238
2239 inode = logfs_read_meta_inode(sb, LOGFS_INO_SEGFILE);
2240 if (IS_ERR(inode))
2241 return PTR_ERR(inode);
2242 super->s_segfile_inode = inode;
2243 return 0;
2244}
2245
2246int logfs_init_rw(struct super_block *sb)
2247{
2248 struct logfs_super *super = logfs_super(sb);
2249 int min_fill = 3 * super->s_no_blocks;
2250
2251 INIT_LIST_HEAD(&super->s_object_alias);
2252 mutex_init(&super->s_write_mutex);
2253 super->s_block_pool = mempool_create_kmalloc_pool(min_fill,
2254 sizeof(struct logfs_block));
2255 super->s_shadow_pool = mempool_create_kmalloc_pool(min_fill,
2256 sizeof(struct logfs_shadow));
2257 return 0;
2258}
2259
2260void logfs_cleanup_rw(struct super_block *sb)
2261{
2262 struct logfs_super *super = logfs_super(sb);
2263
2264 destroy_meta_inode(super->s_segfile_inode);
2265 logfs_mempool_destroy(super->s_block_pool);
2266 logfs_mempool_destroy(super->s_shadow_pool);
2267}
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
new file mode 100644
index 000000000000..f77ce2b470ba
--- /dev/null
+++ b/fs/logfs/segment.c
@@ -0,0 +1,930 @@
1/*
2 * fs/logfs/segment.c - Handling the Object Store
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Object store or ostore makes up the complete device with exception of
9 * the superblock and journal areas. Apart from its own metadata it stores
10 * three kinds of objects: inodes, dentries and blocks, both data and indirect.
11 */
12#include "logfs.h"
13#include <linux/slab.h>
14
15static int logfs_mark_segment_bad(struct super_block *sb, u32 segno)
16{
17 struct logfs_super *super = logfs_super(sb);
18 struct btree_head32 *head = &super->s_reserved_segments;
19 int err;
20
21 err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
22 if (err)
23 return err;
24 logfs_super(sb)->s_bad_segments++;
25 /* FIXME: write to journal */
26 return 0;
27}
28
29int logfs_erase_segment(struct super_block *sb, u32 segno, int ensure_erase)
30{
31 struct logfs_super *super = logfs_super(sb);
32
33 super->s_gec++;
34
35 return super->s_devops->erase(sb, (u64)segno << super->s_segshift,
36 super->s_segsize, ensure_erase);
37}
38
39static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes)
40{
41 s32 ofs;
42
43 logfs_open_area(area, bytes);
44
45 ofs = area->a_used_bytes;
46 area->a_used_bytes += bytes;
47 BUG_ON(area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize);
48
49 return dev_ofs(area->a_sb, area->a_segno, ofs);
50}
51
52static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
53 int use_filler)
54{
55 struct logfs_super *super = logfs_super(sb);
56 struct address_space *mapping = super->s_mapping_inode->i_mapping;
57 filler_t *filler = super->s_devops->readpage;
58 struct page *page;
59
60 BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS);
61 if (use_filler)
62 page = read_cache_page(mapping, index, filler, sb);
63 else {
64 page = find_or_create_page(mapping, index, GFP_NOFS);
65 unlock_page(page);
66 }
67 return page;
68}
69
70void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
71 int use_filler)
72{
73 pgoff_t index = ofs >> PAGE_SHIFT;
74 struct page *page;
75 long offset = ofs & (PAGE_SIZE-1);
76 long copylen;
77
78 /* Only logfs_wbuf_recover may use len==0 */
79 BUG_ON(!len && !use_filler);
80 do {
81 copylen = min((ulong)len, PAGE_SIZE - offset);
82
83 page = get_mapping_page(area->a_sb, index, use_filler);
84 SetPageUptodate(page);
85 BUG_ON(!page); /* FIXME: reserve a pool */
86 memcpy(page_address(page) + offset, buf, copylen);
87 SetPagePrivate(page);
88 page_cache_release(page);
89
90 buf += copylen;
91 len -= copylen;
92 offset = 0;
93 index++;
94 } while (len);
95}
96
97static void pad_partial_page(struct logfs_area *area)
98{
99 struct super_block *sb = area->a_sb;
100 struct page *page;
101 u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
102 pgoff_t index = ofs >> PAGE_SHIFT;
103 long offset = ofs & (PAGE_SIZE-1);
104 u32 len = PAGE_SIZE - offset;
105
106 if (len % PAGE_SIZE) {
107 page = get_mapping_page(sb, index, 0);
108 BUG_ON(!page); /* FIXME: reserve a pool */
109 memset(page_address(page) + offset, 0xff, len);
110 SetPagePrivate(page);
111 page_cache_release(page);
112 }
113}
114
115static void pad_full_pages(struct logfs_area *area)
116{
117 struct super_block *sb = area->a_sb;
118 struct logfs_super *super = logfs_super(sb);
119 u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
120 u32 len = super->s_segsize - area->a_used_bytes;
121 pgoff_t index = PAGE_CACHE_ALIGN(ofs) >> PAGE_CACHE_SHIFT;
122 pgoff_t no_indizes = len >> PAGE_CACHE_SHIFT;
123 struct page *page;
124
125 while (no_indizes) {
126 page = get_mapping_page(sb, index, 0);
127 BUG_ON(!page); /* FIXME: reserve a pool */
128 SetPageUptodate(page);
129 memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
130 SetPagePrivate(page);
131 page_cache_release(page);
132 index++;
133 no_indizes--;
134 }
135}
136
/*
 * bdev_writeseg will write full pages.  Memset the tail to prevent data
 * leaks.  Also make sure we allocate (and memset) all pages for final
 * writeout.
 */
static void pad_wbuf(struct logfs_area *area, int final)
{
	pad_partial_page(area);
	if (!final)
		return;
	pad_full_pages(area);
}
147
148/*
149 * We have to be careful with the alias tree. Since lookup is done by bix,
150 * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with
151 * indirect blocks. So always use it through accessor functions.
152 */
153static void *alias_tree_lookup(struct super_block *sb, u64 ino, u64 bix,
154 level_t level)
155{
156 struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
157 pgoff_t index = logfs_pack_index(bix, level);
158
159 return btree_lookup128(head, ino, index);
160}
161
162static int alias_tree_insert(struct super_block *sb, u64 ino, u64 bix,
163 level_t level, void *val)
164{
165 struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
166 pgoff_t index = logfs_pack_index(bix, level);
167
168 return btree_insert128(head, ino, index, val, GFP_NOFS);
169}
170
171static int btree_write_alias(struct super_block *sb, struct logfs_block *block,
172 write_alias_t *write_one_alias)
173{
174 struct object_alias_item *item;
175 int err;
176
177 list_for_each_entry(item, &block->item_list, list) {
178 err = write_alias_journal(sb, block->ino, block->bix,
179 block->level, item->child_no, item->val);
180 if (err)
181 return err;
182 }
183 return 0;
184}
185
186static struct logfs_block_ops btree_block_ops = {
187 .write_block = btree_write_block,
188 .free_block = __free_block,
189 .write_alias = btree_write_alias,
190};
191
192int logfs_load_object_aliases(struct super_block *sb,
193 struct logfs_obj_alias *oa, int count)
194{
195 struct logfs_super *super = logfs_super(sb);
196 struct logfs_block *block;
197 struct object_alias_item *item;
198 u64 ino, bix;
199 level_t level;
200 int i, err;
201
202 super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;
203 count /= sizeof(*oa);
204 for (i = 0; i < count; i++) {
205 item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
206 if (!item)
207 return -ENOMEM;
208 memset(item, 0, sizeof(*item));
209
210 super->s_no_object_aliases++;
211 item->val = oa[i].val;
212 item->child_no = be16_to_cpu(oa[i].child_no);
213
214 ino = be64_to_cpu(oa[i].ino);
215 bix = be64_to_cpu(oa[i].bix);
216 level = LEVEL(oa[i].level);
217
218 log_aliases("logfs_load_object_aliases(%llx, %llx, %x, %x) %llx\n",
219 ino, bix, level, item->child_no,
220 be64_to_cpu(item->val));
221 block = alias_tree_lookup(sb, ino, bix, level);
222 if (!block) {
223 block = __alloc_block(sb, ino, bix, level);
224 block->ops = &btree_block_ops;
225 err = alias_tree_insert(sb, ino, bix, level, block);
226 BUG_ON(err); /* mempool empty */
227 }
228 if (test_and_set_bit(item->child_no, block->alias_map)) {
229 printk(KERN_ERR"LogFS: Alias collision detected\n");
230 return -EIO;
231 }
232 list_move_tail(&block->alias_list, &super->s_object_alias);
233 list_add(&item->list, &block->item_list);
234 }
235 return 0;
236}
237
238static void kill_alias(void *_block, unsigned long ignore0,
239 u64 ignore1, u64 ignore2, size_t ignore3)
240{
241 struct logfs_block *block = _block;
242 struct super_block *sb = block->sb;
243 struct logfs_super *super = logfs_super(sb);
244 struct object_alias_item *item;
245
246 while (!list_empty(&block->item_list)) {
247 item = list_entry(block->item_list.next, typeof(*item), list);
248 list_del(&item->list);
249 mempool_free(item, super->s_alias_pool);
250 }
251 block->ops->free_block(sb, block);
252}
253
254static int obj_type(struct inode *inode, level_t level)
255{
256 if (level == 0) {
257 if (S_ISDIR(inode->i_mode))
258 return OBJ_DENTRY;
259 if (inode->i_ino == LOGFS_INO_MASTER)
260 return OBJ_INODE;
261 }
262 return OBJ_BLOCK;
263}
264
265static int obj_len(struct super_block *sb, int obj_type)
266{
267 switch (obj_type) {
268 case OBJ_DENTRY:
269 return sizeof(struct logfs_disk_dentry);
270 case OBJ_INODE:
271 return sizeof(struct logfs_disk_inode);
272 case OBJ_BLOCK:
273 return sb->s_blocksize;
274 default:
275 BUG();
276 }
277}
278
279static int __logfs_segment_write(struct inode *inode, void *buf,
280 struct logfs_shadow *shadow, int type, int len, int compr)
281{
282 struct logfs_area *area;
283 struct super_block *sb = inode->i_sb;
284 s64 ofs;
285 struct logfs_object_header h;
286 int acc_len;
287
288 if (shadow->gc_level == 0)
289 acc_len = len;
290 else
291 acc_len = obj_len(sb, type);
292
293 area = get_area(sb, shadow->gc_level);
294 ofs = logfs_get_free_bytes(area, len + LOGFS_OBJECT_HEADERSIZE);
295 LOGFS_BUG_ON(ofs <= 0, sb);
296 /*
297 * Order is important. logfs_get_free_bytes(), by modifying the
298 * segment file, may modify the content of the very page we're about
299 * to write now. Which is fine, as long as the calculated crc and
300 * written data still match. So do the modifications _before_
301 * calculating the crc.
302 */
303
304 h.len = cpu_to_be16(len);
305 h.type = type;
306 h.compr = compr;
307 h.ino = cpu_to_be64(inode->i_ino);
308 h.bix = cpu_to_be64(shadow->bix);
309 h.crc = logfs_crc32(&h, sizeof(h) - 4, 4);
310 h.data_crc = logfs_crc32(buf, len, 0);
311
312 logfs_buf_write(area, ofs, &h, sizeof(h));
313 logfs_buf_write(area, ofs + LOGFS_OBJECT_HEADERSIZE, buf, len);
314
315 shadow->new_ofs = ofs;
316 shadow->new_len = acc_len + LOGFS_OBJECT_HEADERSIZE;
317
318 return 0;
319}
320
321static s64 logfs_segment_write_compress(struct inode *inode, void *buf,
322 struct logfs_shadow *shadow, int type, int len)
323{
324 struct super_block *sb = inode->i_sb;
325 void *compressor_buf = logfs_super(sb)->s_compressed_je;
326 ssize_t compr_len;
327 int ret;
328
329 mutex_lock(&logfs_super(sb)->s_journal_mutex);
330 compr_len = logfs_compress(buf, compressor_buf, len, len);
331
332 if (compr_len >= 0) {
333 ret = __logfs_segment_write(inode, compressor_buf, shadow,
334 type, compr_len, COMPR_ZLIB);
335 } else {
336 ret = __logfs_segment_write(inode, buf, shadow, type, len,
337 COMPR_NONE);
338 }
339 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
340 return ret;
341}
342
343/**
344 * logfs_segment_write - write data block to object store
345 * @inode: inode containing data
346 *
347 * Returns an errno or zero.
348 */
349int logfs_segment_write(struct inode *inode, struct page *page,
350 struct logfs_shadow *shadow)
351{
352 struct super_block *sb = inode->i_sb;
353 struct logfs_super *super = logfs_super(sb);
354 int do_compress, type, len;
355 int ret;
356 void *buf;
357
358 super->s_flags |= LOGFS_SB_FLAG_DIRTY;
359 BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
360 do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED;
361 if (shadow->gc_level != 0) {
362 /* temporarily disable compression for indirect blocks */
363 do_compress = 0;
364 }
365
366 type = obj_type(inode, shrink_level(shadow->gc_level));
367 len = obj_len(sb, type);
368 buf = kmap(page);
369 if (do_compress)
370 ret = logfs_segment_write_compress(inode, buf, shadow, type,
371 len);
372 else
373 ret = __logfs_segment_write(inode, buf, shadow, type, len,
374 COMPR_NONE);
375 kunmap(page);
376
377 log_segment("logfs_segment_write(%llx, %llx, %x) %llx->%llx %x->%x\n",
378 shadow->ino, shadow->bix, shadow->gc_level,
379 shadow->old_ofs, shadow->new_ofs,
380 shadow->old_len, shadow->new_len);
381 /* this BUG_ON did catch a locking bug. useful */
382 BUG_ON(!(shadow->new_ofs & (super->s_segsize - 1)));
383 return ret;
384}
385
386int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf)
387{
388 pgoff_t index = ofs >> PAGE_SHIFT;
389 struct page *page;
390 long offset = ofs & (PAGE_SIZE-1);
391 long copylen;
392
393 while (len) {
394 copylen = min((ulong)len, PAGE_SIZE - offset);
395
396 page = get_mapping_page(sb, index, 1);
397 if (IS_ERR(page))
398 return PTR_ERR(page);
399 memcpy(buf, page_address(page) + offset, copylen);
400 page_cache_release(page);
401
402 buf += copylen;
403 len -= copylen;
404 offset = 0;
405 index++;
406 }
407 return 0;
408}
409
410/*
411 * The "position" of indirect blocks is ambiguous. It can be the position
412 * of any data block somewhere behind this indirect block. So we need to
413 * normalize the positions through logfs_block_mask() before comparing.
414 */
415static int check_pos(struct super_block *sb, u64 pos1, u64 pos2, level_t level)
416{
417 return (pos1 & logfs_block_mask(sb, level)) !=
418 (pos2 & logfs_block_mask(sb, level));
419}
420
421#if 0
422static int read_seg_header(struct super_block *sb, u64 ofs,
423 struct logfs_segment_header *sh)
424{
425 __be32 crc;
426 int err;
427
428 err = wbuf_read(sb, ofs, sizeof(*sh), sh);
429 if (err)
430 return err;
431 crc = logfs_crc32(sh, sizeof(*sh), 4);
432 if (crc != sh->crc) {
433 printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
434 "got %x\n", ofs, be32_to_cpu(sh->crc),
435 be32_to_cpu(crc));
436 return -EIO;
437 }
438 return 0;
439}
440#endif
441
442static int read_obj_header(struct super_block *sb, u64 ofs,
443 struct logfs_object_header *oh)
444{
445 __be32 crc;
446 int err;
447
448 err = wbuf_read(sb, ofs, sizeof(*oh), oh);
449 if (err)
450 return err;
451 crc = logfs_crc32(oh, sizeof(*oh) - 4, 4);
452 if (crc != oh->crc) {
453 printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
454 "got %x\n", ofs, be32_to_cpu(oh->crc),
455 be32_to_cpu(crc));
456 return -EIO;
457 }
458 return 0;
459}
460
461static void move_btree_to_page(struct inode *inode, struct page *page,
462 __be64 *data)
463{
464 struct super_block *sb = inode->i_sb;
465 struct logfs_super *super = logfs_super(sb);
466 struct btree_head128 *head = &super->s_object_alias_tree;
467 struct logfs_block *block;
468 struct object_alias_item *item, *next;
469
470 if (!(super->s_flags & LOGFS_SB_FLAG_OBJ_ALIAS))
471 return;
472
473 block = btree_remove128(head, inode->i_ino, page->index);
474 if (!block)
475 return;
476
477 log_blockmove("move_btree_to_page(%llx, %llx, %x)\n",
478 block->ino, block->bix, block->level);
479 list_for_each_entry_safe(item, next, &block->item_list, list) {
480 data[item->child_no] = item->val;
481 list_del(&item->list);
482 mempool_free(item, super->s_alias_pool);
483 }
484 block->page = page;
485 SetPagePrivate(page);
486 page->private = (unsigned long)block;
487 block->ops = &indirect_block_ops;
488 initialize_block_counters(page, block, data, 0);
489}
490
/*
 * Plain wrapper around find_next_bit().  Its only purpose is to silence a
 * false, yet annoying gcc warning that otherwise makes the editor jump into
 * bitops.h on every recompile of this file.
 * TODO: Complain to gcc folks about this and upgrade compiler.
 */
static unsigned long fnb(const unsigned long *addr,
		unsigned long size, unsigned long offset)
{
	unsigned long bit = find_next_bit(addr, size, offset);

	return bit;
}
501
502void move_page_to_btree(struct page *page)
503{
504 struct logfs_block *block = logfs_block(page);
505 struct super_block *sb = block->sb;
506 struct logfs_super *super = logfs_super(sb);
507 struct object_alias_item *item;
508 unsigned long pos;
509 __be64 *child;
510 int err;
511
512 if (super->s_flags & LOGFS_SB_FLAG_SHUTDOWN) {
513 block->ops->free_block(sb, block);
514 return;
515 }
516 log_blockmove("move_page_to_btree(%llx, %llx, %x)\n",
517 block->ino, block->bix, block->level);
518 super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;
519
520 for (pos = 0; ; pos++) {
521 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
522 if (pos >= LOGFS_BLOCK_FACTOR)
523 break;
524
525 item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
526 BUG_ON(!item); /* mempool empty */
527 memset(item, 0, sizeof(*item));
528
529 child = kmap_atomic(page, KM_USER0);
530 item->val = child[pos];
531 kunmap_atomic(child, KM_USER0);
532 item->child_no = pos;
533 list_add(&item->list, &block->item_list);
534 }
535 block->page = NULL;
536 ClearPagePrivate(page);
537 page->private = 0;
538 block->ops = &btree_block_ops;
539 err = alias_tree_insert(block->sb, block->ino, block->bix, block->level,
540 block);
541 BUG_ON(err); /* mempool empty */
542 ClearPageUptodate(page);
543}
544
545static int __logfs_segment_read(struct inode *inode, void *buf,
546 u64 ofs, u64 bix, level_t level)
547{
548 struct super_block *sb = inode->i_sb;
549 void *compressor_buf = logfs_super(sb)->s_compressed_je;
550 struct logfs_object_header oh;
551 __be32 crc;
552 u16 len;
553 int err, block_len;
554
555 block_len = obj_len(sb, obj_type(inode, level));
556 err = read_obj_header(sb, ofs, &oh);
557 if (err)
558 goto out_err;
559
560 err = -EIO;
561 if (be64_to_cpu(oh.ino) != inode->i_ino
562 || check_pos(sb, be64_to_cpu(oh.bix), bix, level)) {
563 printk(KERN_ERR"LOGFS: (ino, bix) don't match at %llx: "
564 "expected (%lx, %llx), got (%llx, %llx)\n",
565 ofs, inode->i_ino, bix,
566 be64_to_cpu(oh.ino), be64_to_cpu(oh.bix));
567 goto out_err;
568 }
569
570 len = be16_to_cpu(oh.len);
571
572 switch (oh.compr) {
573 case COMPR_NONE:
574 err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, buf);
575 if (err)
576 goto out_err;
577 crc = logfs_crc32(buf, len, 0);
578 if (crc != oh.data_crc) {
579 printk(KERN_ERR"LOGFS: uncompressed data crc error at "
580 "%llx: expected %x, got %x\n", ofs,
581 be32_to_cpu(oh.data_crc),
582 be32_to_cpu(crc));
583 goto out_err;
584 }
585 break;
586 case COMPR_ZLIB:
587 mutex_lock(&logfs_super(sb)->s_journal_mutex);
588 err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len,
589 compressor_buf);
590 if (err) {
591 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
592 goto out_err;
593 }
594 crc = logfs_crc32(compressor_buf, len, 0);
595 if (crc != oh.data_crc) {
596 printk(KERN_ERR"LOGFS: compressed data crc error at "
597 "%llx: expected %x, got %x\n", ofs,
598 be32_to_cpu(oh.data_crc),
599 be32_to_cpu(crc));
600 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
601 goto out_err;
602 }
603 err = logfs_uncompress(compressor_buf, buf, len, block_len);
604 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
605 if (err) {
606 printk(KERN_ERR"LOGFS: uncompress error at %llx\n", ofs);
607 goto out_err;
608 }
609 break;
610 default:
611 LOGFS_BUG(sb);
612 err = -EIO;
613 goto out_err;
614 }
615 return 0;
616
617out_err:
618 logfs_set_ro(sb);
619 printk(KERN_ERR"LOGFS: device is read-only now\n");
620 LOGFS_BUG(sb);
621 return err;
622}
623
624/**
625 * logfs_segment_read - read data block from object store
626 * @inode: inode containing data
627 * @buf: data buffer
628 * @ofs: physical data offset
629 * @bix: block index
630 * @level: block level
631 *
632 * Returns 0 on success or a negative errno.
633 */
634int logfs_segment_read(struct inode *inode, struct page *page,
635 u64 ofs, u64 bix, level_t level)
636{
637 int err;
638 void *buf;
639
640 if (PageUptodate(page))
641 return 0;
642
643 ofs &= ~LOGFS_FULLY_POPULATED;
644
645 buf = kmap(page);
646 err = __logfs_segment_read(inode, buf, ofs, bix, level);
647 if (!err) {
648 move_btree_to_page(inode, page, buf);
649 SetPageUptodate(page);
650 }
651 kunmap(page);
652 log_segment("logfs_segment_read(%lx, %llx, %x) %llx (%d)\n",
653 inode->i_ino, bix, level, ofs, err);
654 return err;
655}
656
657int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow)
658{
659 struct super_block *sb = inode->i_sb;
660 struct logfs_super *super = logfs_super(sb);
661 struct logfs_object_header h;
662 u16 len;
663 int err;
664
665 super->s_flags |= LOGFS_SB_FLAG_DIRTY;
666 BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
667 BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED);
668 if (!shadow->old_ofs)
669 return 0;
670
671 log_segment("logfs_segment_delete(%llx, %llx, %x) %llx->%llx %x->%x\n",
672 shadow->ino, shadow->bix, shadow->gc_level,
673 shadow->old_ofs, shadow->new_ofs,
674 shadow->old_len, shadow->new_len);
675 err = read_obj_header(sb, shadow->old_ofs, &h);
676 LOGFS_BUG_ON(err, sb);
677 LOGFS_BUG_ON(be64_to_cpu(h.ino) != inode->i_ino, sb);
678 LOGFS_BUG_ON(check_pos(sb, shadow->bix, be64_to_cpu(h.bix),
679 shrink_level(shadow->gc_level)), sb);
680
681 if (shadow->gc_level == 0)
682 len = be16_to_cpu(h.len);
683 else
684 len = obj_len(sb, h.type);
685 shadow->old_len = len + sizeof(h);
686 return 0;
687}
688
689void freeseg(struct super_block *sb, u32 segno)
690{
691 struct logfs_super *super = logfs_super(sb);
692 struct address_space *mapping = super->s_mapping_inode->i_mapping;
693 struct page *page;
694 u64 ofs, start, end;
695
696 start = dev_ofs(sb, segno, 0);
697 end = dev_ofs(sb, segno + 1, 0);
698 for (ofs = start; ofs < end; ofs += PAGE_SIZE) {
699 page = find_get_page(mapping, ofs >> PAGE_SHIFT);
700 if (!page)
701 continue;
702 ClearPagePrivate(page);
703 page_cache_release(page);
704 }
705}
706
707int logfs_open_area(struct logfs_area *area, size_t bytes)
708{
709 struct super_block *sb = area->a_sb;
710 struct logfs_super *super = logfs_super(sb);
711 int err, closed = 0;
712
713 if (area->a_is_open && area->a_used_bytes + bytes <= super->s_segsize)
714 return 0;
715
716 if (area->a_is_open) {
717 u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
718 u32 len = super->s_segsize - area->a_written_bytes;
719
720 log_gc("logfs_close_area(%x)\n", area->a_segno);
721 pad_wbuf(area, 1);
722 super->s_devops->writeseg(area->a_sb, ofs, len);
723 freeseg(sb, area->a_segno);
724 closed = 1;
725 }
726
727 area->a_used_bytes = 0;
728 area->a_written_bytes = 0;
729again:
730 area->a_ops->get_free_segment(area);
731 area->a_ops->get_erase_count(area);
732
733 log_gc("logfs_open_area(%x, %x)\n", area->a_segno, area->a_level);
734 err = area->a_ops->erase_segment(area);
735 if (err) {
736 printk(KERN_WARNING "LogFS: Error erasing segment %x\n",
737 area->a_segno);
738 logfs_mark_segment_bad(sb, area->a_segno);
739 goto again;
740 }
741 area->a_is_open = 1;
742 return closed;
743}
744
745void logfs_sync_area(struct logfs_area *area)
746{
747 struct super_block *sb = area->a_sb;
748 struct logfs_super *super = logfs_super(sb);
749 u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
750 u32 len = (area->a_used_bytes - area->a_written_bytes);
751
752 if (super->s_writesize)
753 len &= ~(super->s_writesize - 1);
754 if (len == 0)
755 return;
756 pad_wbuf(area, 0);
757 super->s_devops->writeseg(sb, ofs, len);
758 area->a_written_bytes += len;
759}
760
761void logfs_sync_segments(struct super_block *sb)
762{
763 struct logfs_super *super = logfs_super(sb);
764 int i;
765
766 for_each_area(i)
767 logfs_sync_area(super->s_area[i]);
768}
769
770/*
771 * Pick a free segment to be used for this area. Effectively takes a
772 * candidate from the free list (not really a candidate anymore).
773 */
774static void ostore_get_free_segment(struct logfs_area *area)
775{
776 struct super_block *sb = area->a_sb;
777 struct logfs_super *super = logfs_super(sb);
778
779 if (super->s_free_list.count == 0) {
780 printk(KERN_ERR"LOGFS: ran out of free segments\n");
781 LOGFS_BUG(sb);
782 }
783
784 area->a_segno = get_best_cand(sb, &super->s_free_list, NULL);
785}
786
787static void ostore_get_erase_count(struct logfs_area *area)
788{
789 struct logfs_segment_entry se;
790 u32 ec_level;
791
792 logfs_get_segment_entry(area->a_sb, area->a_segno, &se);
793 BUG_ON(se.ec_level == cpu_to_be32(BADSEG) ||
794 se.valid == cpu_to_be32(RESERVED));
795
796 ec_level = be32_to_cpu(se.ec_level);
797 area->a_erase_count = (ec_level >> 4) + 1;
798}
799
800static int ostore_erase_segment(struct logfs_area *area)
801{
802 struct super_block *sb = area->a_sb;
803 struct logfs_segment_header sh;
804 u64 ofs;
805 int err;
806
807 err = logfs_erase_segment(sb, area->a_segno, 0);
808 if (err)
809 return err;
810
811 sh.pad = 0;
812 sh.type = SEG_OSTORE;
813 sh.level = (__force u8)area->a_level;
814 sh.segno = cpu_to_be32(area->a_segno);
815 sh.ec = cpu_to_be32(area->a_erase_count);
816 sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
817 sh.crc = logfs_crc32(&sh, sizeof(sh), 4);
818
819 logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count,
820 area->a_level);
821
822 ofs = dev_ofs(sb, area->a_segno, 0);
823 area->a_used_bytes = sizeof(sh);
824 logfs_buf_write(area, ofs, &sh, sizeof(sh));
825 return 0;
826}
827
828static const struct logfs_area_ops ostore_area_ops = {
829 .get_free_segment = ostore_get_free_segment,
830 .get_erase_count = ostore_get_erase_count,
831 .erase_segment = ostore_erase_segment,
832};
833
834static void free_area(struct logfs_area *area)
835{
836 if (area)
837 freeseg(area->a_sb, area->a_segno);
838 kfree(area);
839}
840
841static struct logfs_area *alloc_area(struct super_block *sb)
842{
843 struct logfs_area *area;
844
845 area = kzalloc(sizeof(*area), GFP_KERNEL);
846 if (!area)
847 return NULL;
848
849 area->a_sb = sb;
850 return area;
851}
852
/* Invalidating a page of the mapping inode is treated as a bug. */
static void map_invalidatepage(struct page *page, unsigned long l)
{
	BUG();
}
857
858static int map_releasepage(struct page *page, gfp_t g)
859{
860 /* Don't release these pages */
861 return 0;
862}
863
864static const struct address_space_operations mapping_aops = {
865 .invalidatepage = map_invalidatepage,
866 .releasepage = map_releasepage,
867 .set_page_dirty = __set_page_dirty_nobuffers,
868};
869
870int logfs_init_mapping(struct super_block *sb)
871{
872 struct logfs_super *super = logfs_super(sb);
873 struct address_space *mapping;
874 struct inode *inode;
875
876 inode = logfs_new_meta_inode(sb, LOGFS_INO_MAPPING);
877 if (IS_ERR(inode))
878 return PTR_ERR(inode);
879 super->s_mapping_inode = inode;
880 mapping = inode->i_mapping;
881 mapping->a_ops = &mapping_aops;
882 /* Would it be possible to use __GFP_HIGHMEM as well? */
883 mapping_set_gfp_mask(mapping, GFP_NOFS);
884 return 0;
885}
886
887int logfs_init_areas(struct super_block *sb)
888{
889 struct logfs_super *super = logfs_super(sb);
890 int i = -1;
891
892 super->s_alias_pool = mempool_create_kmalloc_pool(600,
893 sizeof(struct object_alias_item));
894 if (!super->s_alias_pool)
895 return -ENOMEM;
896
897 super->s_journal_area = alloc_area(sb);
898 if (!super->s_journal_area)
899 goto err;
900
901 for_each_area(i) {
902 super->s_area[i] = alloc_area(sb);
903 if (!super->s_area[i])
904 goto err;
905 super->s_area[i]->a_level = GC_LEVEL(i);
906 super->s_area[i]->a_ops = &ostore_area_ops;
907 }
908 btree_init_mempool128(&super->s_object_alias_tree,
909 super->s_btree_pool);
910 return 0;
911
912err:
913 for (i--; i >= 0; i--)
914 free_area(super->s_area[i]);
915 free_area(super->s_journal_area);
916 logfs_mempool_destroy(super->s_alias_pool);
917 return -ENOMEM;
918}
919
920void logfs_cleanup_areas(struct super_block *sb)
921{
922 struct logfs_super *super = logfs_super(sb);
923 int i;
924
925 btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias);
926 for_each_area(i)
927 free_area(super->s_area[i]);
928 free_area(super->s_journal_area);
929 destroy_meta_inode(super->s_mapping_inode);
930}
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
new file mode 100644
index 000000000000..5866ee6e1327
--- /dev/null
+++ b/fs/logfs/super.c
@@ -0,0 +1,657 @@
1/*
2 * fs/logfs/super.c
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Generally contains mount/umount code and also serves as a dump area for
9 * any functions that don't fit elsewhere and neither justify a file of their
10 * own.
11 */
12#include "logfs.h"
13#include <linux/bio.h>
14#include <linux/slab.h>
15#include <linux/blkdev.h>
16#include <linux/mtd/mtd.h>
17#include <linux/statfs.h>
18#include <linux/buffer_head.h>
19
20static DEFINE_MUTEX(emergency_mutex);
21static struct page *emergency_page;
22
23struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index)
24{
25 filler_t *filler = (filler_t *)mapping->a_ops->readpage;
26 struct page *page;
27 int err;
28
29 page = read_cache_page(mapping, index, filler, NULL);
30 if (page)
31 return page;
32
33 /* No more pages available, switch to emergency page */
34 printk(KERN_INFO"Logfs: Using emergency page\n");
35 mutex_lock(&emergency_mutex);
36 err = filler(NULL, emergency_page);
37 if (err) {
38 mutex_unlock(&emergency_mutex);
39 printk(KERN_EMERG"Logfs: Error reading emergency page\n");
40 return ERR_PTR(err);
41 }
42 return emergency_page;
43}
44
45void emergency_read_end(struct page *page)
46{
47 if (page == emergency_page)
48 mutex_unlock(&emergency_mutex);
49 else
50 page_cache_release(page);
51}
52
53static void dump_segfile(struct super_block *sb)
54{
55 struct logfs_super *super = logfs_super(sb);
56 struct logfs_segment_entry se;
57 u32 segno;
58
59 for (segno = 0; segno < super->s_no_segs; segno++) {
60 logfs_get_segment_entry(sb, segno, &se);
61 printk("%3x: %6x %8x", segno, be32_to_cpu(se.ec_level),
62 be32_to_cpu(se.valid));
63 if (++segno < super->s_no_segs) {
64 logfs_get_segment_entry(sb, segno, &se);
65 printk(" %6x %8x", be32_to_cpu(se.ec_level),
66 be32_to_cpu(se.valid));
67 }
68 if (++segno < super->s_no_segs) {
69 logfs_get_segment_entry(sb, segno, &se);
70 printk(" %6x %8x", be32_to_cpu(se.ec_level),
71 be32_to_cpu(se.valid));
72 }
73 if (++segno < super->s_no_segs) {
74 logfs_get_segment_entry(sb, segno, &se);
75 printk(" %6x %8x", be32_to_cpu(se.ec_level),
76 be32_to_cpu(se.valid));
77 }
78 printk("\n");
79 }
80}
81
/*
 * logfs_crash_dump - dump debug information to device
 *
 * The LogFS superblock only occupies part of a segment.  This function is
 * meant to write as much debug information as it can gather into the spare
 * space.  At present it only prints the segment file via dump_segfile().
 */
void logfs_crash_dump(struct super_block *sb)
{
	dump_segfile(sb);
}
92
/*
 * TODO: move to lib/string.c
 */
/**
 * memchr_inv - Find the first byte in an area of memory that is not @c.
 * @s: The memory area
 * @c: The byte value the area is expected to be filled with
 * @n: The size of the area.
 *
 * returns the address of the first byte other than @c, or %NULL
 * if the whole buffer contains just @c.
 */
void *memchr_inv(const void *s, int c, size_t n)
{
	const unsigned char *bytes = s;
	const unsigned char value = (unsigned char)c;
	size_t i;

	for (i = 0; i < n; i++) {
		if (bytes[i] != value)
			return (void *)(bytes + i);
	}

	return NULL;
}
114
115/*
116 * FIXME: There should be a reserve for root, similar to ext2.
117 */
118int logfs_statfs(struct dentry *dentry, struct kstatfs *stats)
119{
120 struct super_block *sb = dentry->d_sb;
121 struct logfs_super *super = logfs_super(sb);
122
123 stats->f_type = LOGFS_MAGIC_U32;
124 stats->f_bsize = sb->s_blocksize;
125 stats->f_blocks = super->s_size >> LOGFS_BLOCK_BITS >> 3;
126 stats->f_bfree = super->s_free_bytes >> sb->s_blocksize_bits;
127 stats->f_bavail = super->s_free_bytes >> sb->s_blocksize_bits;
128 stats->f_files = 0;
129 stats->f_ffree = 0;
130 stats->f_namelen = LOGFS_MAX_NAMELEN;
131 return 0;
132}
133
134static int logfs_sb_set(struct super_block *sb, void *_super)
135{
136 struct logfs_super *super = _super;
137
138 sb->s_fs_info = super;
139 sb->s_mtd = super->s_mtd;
140 sb->s_bdev = super->s_bdev;
141 if (sb->s_bdev)
142 sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
143 if (sb->s_mtd)
144 sb->s_bdi = sb->s_mtd->backing_dev_info;
145 return 0;
146}
147
148static int logfs_sb_test(struct super_block *sb, void *_super)
149{
150 struct logfs_super *super = _super;
151 struct mtd_info *mtd = super->s_mtd;
152
153 if (mtd && sb->s_mtd == mtd)
154 return 1;
155 if (super->s_bdev && sb->s_bdev == super->s_bdev)
156 return 1;
157 return 0;
158}
159
160static void set_segment_header(struct logfs_segment_header *sh, u8 type,
161 u8 level, u32 segno, u32 ec)
162{
163 sh->pad = 0;
164 sh->type = type;
165 sh->level = level;
166 sh->segno = cpu_to_be32(segno);
167 sh->ec = cpu_to_be32(ec);
168 sh->gec = cpu_to_be64(segno);
169 sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4);
170}
171
172static void logfs_write_ds(struct super_block *sb, struct logfs_disk_super *ds,
173 u32 segno, u32 ec)
174{
175 struct logfs_super *super = logfs_super(sb);
176 struct logfs_segment_header *sh = &ds->ds_sh;
177 int i;
178
179 memset(ds, 0, sizeof(*ds));
180 set_segment_header(sh, SEG_SUPER, 0, segno, ec);
181
182 ds->ds_ifile_levels = super->s_ifile_levels;
183 ds->ds_iblock_levels = super->s_iblock_levels;
184 ds->ds_data_levels = super->s_data_levels; /* XXX: Remove */
185 ds->ds_segment_shift = super->s_segshift;
186 ds->ds_block_shift = sb->s_blocksize_bits;
187 ds->ds_write_shift = super->s_writeshift;
188 ds->ds_filesystem_size = cpu_to_be64(super->s_size);
189 ds->ds_segment_size = cpu_to_be32(super->s_segsize);
190 ds->ds_bad_seg_reserve = cpu_to_be32(super->s_bad_seg_reserve);
191 ds->ds_feature_incompat = cpu_to_be64(super->s_feature_incompat);
192 ds->ds_feature_ro_compat= cpu_to_be64(super->s_feature_ro_compat);
193 ds->ds_feature_compat = cpu_to_be64(super->s_feature_compat);
194 ds->ds_feature_flags = cpu_to_be64(super->s_feature_flags);
195 ds->ds_root_reserve = cpu_to_be64(super->s_root_reserve);
196 ds->ds_speed_reserve = cpu_to_be64(super->s_speed_reserve);
197 journal_for_each(i)
198 ds->ds_journal_seg[i] = cpu_to_be32(super->s_journal_seg[i]);
199 ds->ds_magic = cpu_to_be64(LOGFS_MAGIC);
200 ds->ds_crc = logfs_crc32(ds, sizeof(*ds),
201 LOGFS_SEGMENT_HEADERSIZE + 12);
202}
203
204static int write_one_sb(struct super_block *sb,
205 struct page *(*find_sb)(struct super_block *sb, u64 *ofs))
206{
207 struct logfs_super *super = logfs_super(sb);
208 struct logfs_disk_super *ds;
209 struct logfs_segment_entry se;
210 struct page *page;
211 u64 ofs;
212 u32 ec, segno;
213 int err;
214
215 page = find_sb(sb, &ofs);
216 if (!page)
217 return -EIO;
218 ds = page_address(page);
219 segno = seg_no(sb, ofs);
220 logfs_get_segment_entry(sb, segno, &se);
221 ec = be32_to_cpu(se.ec_level) >> 4;
222 ec++;
223 logfs_set_segment_erased(sb, segno, ec, 0);
224 logfs_write_ds(sb, ds, segno, ec);
225 err = super->s_devops->write_sb(sb, page);
226 page_cache_release(page);
227 return err;
228}
229
230int logfs_write_sb(struct super_block *sb)
231{
232 struct logfs_super *super = logfs_super(sb);
233 int err;
234
235 /* First superblock */
236 err = write_one_sb(sb, super->s_devops->find_first_sb);
237 if (err)
238 return err;
239
240 /* Last superblock */
241 err = write_one_sb(sb, super->s_devops->find_last_sb);
242 if (err)
243 return err;
244 return 0;
245}
246
247static int ds_cmp(const void *ds0, const void *ds1)
248{
249 size_t len = sizeof(struct logfs_disk_super);
250
251 /* We know the segment headers differ, so ignore them */
252 len -= LOGFS_SEGMENT_HEADERSIZE;
253 ds0 += LOGFS_SEGMENT_HEADERSIZE;
254 ds1 += LOGFS_SEGMENT_HEADERSIZE;
255 return memcmp(ds0, ds1, len);
256}
257
258static int logfs_recover_sb(struct super_block *sb)
259{
260 struct logfs_super *super = logfs_super(sb);
261 struct logfs_disk_super _ds0, *ds0 = &_ds0;
262 struct logfs_disk_super _ds1, *ds1 = &_ds1;
263 int err, valid0, valid1;
264
265 /* read first superblock */
266 err = wbuf_read(sb, super->s_sb_ofs[0], sizeof(*ds0), ds0);
267 if (err)
268 return err;
269 /* read last superblock */
270 err = wbuf_read(sb, super->s_sb_ofs[1], sizeof(*ds1), ds1);
271 if (err)
272 return err;
273 valid0 = logfs_check_ds(ds0) == 0;
274 valid1 = logfs_check_ds(ds1) == 0;
275
276 if (!valid0 && valid1) {
277 printk(KERN_INFO"First superblock is invalid - fixing.\n");
278 return write_one_sb(sb, super->s_devops->find_first_sb);
279 }
280 if (valid0 && !valid1) {
281 printk(KERN_INFO"Last superblock is invalid - fixing.\n");
282 return write_one_sb(sb, super->s_devops->find_last_sb);
283 }
284 if (valid0 && valid1 && ds_cmp(ds0, ds1)) {
285 printk(KERN_INFO"Superblocks don't match - fixing.\n");
286 return logfs_write_sb(sb);
287 }
288 /* If neither is valid now, something's wrong. Didn't we properly
289 * check them before?!? */
290 BUG_ON(!valid0 && !valid1);
291 return 0;
292}
293
/*
 * Run the steps needed before the filesystem may be written to, stopping at
 * the first one that fails.  Returns 0 or that step's error.
 */
static int logfs_make_writeable(struct super_block *sb)
{
	int err;

	err = logfs_open_segfile(sb);
	if (err)
		return err;

	/* Repair any broken superblock copies */
	err = logfs_recover_sb(sb);
	if (err)
		return err;

	/* Check areas for trailing unaccounted data */
	err = logfs_check_areas(sb);
	if (err)
		return err;

	/* Do one GC pass before any data gets dirtied */
	logfs_gc_pass(sb);

	/* after all initializations are done, replay the journal
	 * for rw-mounts, if necessary */
	return logfs_replay_journal(sb);
}
323
324static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
325{
326 struct logfs_super *super = logfs_super(sb);
327 struct inode *rootdir;
328 int err;
329
330 /* root dir */
331 rootdir = logfs_iget(sb, LOGFS_INO_ROOT);
332 if (IS_ERR(rootdir))
333 goto fail;
334
335 sb->s_root = d_alloc_root(rootdir);
336 if (!sb->s_root)
337 goto fail2;
338
339 super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
340 if (!super->s_erase_page)
341 goto fail2;
342 memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE);
343
344 /* FIXME: check for read-only mounts */
345 err = logfs_make_writeable(sb);
346 if (err)
347 goto fail3;
348
349 log_super("LogFS: Finished mounting\n");
350 simple_set_mnt(mnt, sb);
351 return 0;
352
353fail3:
354 __free_page(super->s_erase_page);
355fail2:
356 iput(rootdir);
357fail:
358 iput(logfs_super(sb)->s_master_inode);
359 return -EIO;
360}
361
362int logfs_check_ds(struct logfs_disk_super *ds)
363{
364 struct logfs_segment_header *sh = &ds->ds_sh;
365
366 if (ds->ds_magic != cpu_to_be64(LOGFS_MAGIC))
367 return -EINVAL;
368 if (sh->crc != logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4))
369 return -EINVAL;
370 if (ds->ds_crc != logfs_crc32(ds, sizeof(*ds),
371 LOGFS_SEGMENT_HEADERSIZE + 12))
372 return -EINVAL;
373 return 0;
374}
375
376static struct page *find_super_block(struct super_block *sb)
377{
378 struct logfs_super *super = logfs_super(sb);
379 struct page *first, *last;
380
381 first = super->s_devops->find_first_sb(sb, &super->s_sb_ofs[0]);
382 if (!first || IS_ERR(first))
383 return NULL;
384 last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]);
385 if (!last || IS_ERR(first)) {
386 page_cache_release(first);
387 return NULL;
388 }
389
390 if (!logfs_check_ds(page_address(first))) {
391 page_cache_release(last);
392 return first;
393 }
394
395 /* First one didn't work, try the second superblock */
396 if (!logfs_check_ds(page_address(last))) {
397 page_cache_release(first);
398 return last;
399 }
400
401 /* Neither worked, sorry folks */
402 page_cache_release(first);
403 page_cache_release(last);
404 return NULL;
405}
406
407static int __logfs_read_sb(struct super_block *sb)
408{
409 struct logfs_super *super = logfs_super(sb);
410 struct page *page;
411 struct logfs_disk_super *ds;
412 int i;
413
414 page = find_super_block(sb);
415 if (!page)
416 return -EIO;
417
418 ds = page_address(page);
419 super->s_size = be64_to_cpu(ds->ds_filesystem_size);
420 super->s_root_reserve = be64_to_cpu(ds->ds_root_reserve);
421 super->s_speed_reserve = be64_to_cpu(ds->ds_speed_reserve);
422 super->s_bad_seg_reserve = be32_to_cpu(ds->ds_bad_seg_reserve);
423 super->s_segsize = 1 << ds->ds_segment_shift;
424 super->s_segmask = (1 << ds->ds_segment_shift) - 1;
425 super->s_segshift = ds->ds_segment_shift;
426 sb->s_blocksize = 1 << ds->ds_block_shift;
427 sb->s_blocksize_bits = ds->ds_block_shift;
428 super->s_writesize = 1 << ds->ds_write_shift;
429 super->s_writeshift = ds->ds_write_shift;
430 super->s_no_segs = super->s_size >> super->s_segshift;
431 super->s_no_blocks = super->s_segsize >> sb->s_blocksize_bits;
432 super->s_feature_incompat = be64_to_cpu(ds->ds_feature_incompat);
433 super->s_feature_ro_compat = be64_to_cpu(ds->ds_feature_ro_compat);
434 super->s_feature_compat = be64_to_cpu(ds->ds_feature_compat);
435 super->s_feature_flags = be64_to_cpu(ds->ds_feature_flags);
436
437 journal_for_each(i)
438 super->s_journal_seg[i] = be32_to_cpu(ds->ds_journal_seg[i]);
439
440 super->s_ifile_levels = ds->ds_ifile_levels;
441 super->s_iblock_levels = ds->ds_iblock_levels;
442 super->s_data_levels = ds->ds_data_levels;
443 super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels
444 + super->s_data_levels;
445 page_cache_release(page);
446 return 0;
447}
448
449static int logfs_read_sb(struct super_block *sb, int read_only)
450{
451 struct logfs_super *super = logfs_super(sb);
452 int ret;
453
454 super->s_btree_pool = mempool_create(32, btree_alloc, btree_free, NULL);
455 if (!super->s_btree_pool)
456 return -ENOMEM;
457
458 btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool);
459 btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool);
460 btree_init_mempool32(&super->s_shadow_tree.segment_map,
461 super->s_btree_pool);
462
463 ret = logfs_init_mapping(sb);
464 if (ret)
465 return ret;
466
467 ret = __logfs_read_sb(sb);
468 if (ret)
469 return ret;
470
471 if (super->s_feature_incompat & ~LOGFS_FEATURES_INCOMPAT)
472 return -EIO;
473 if ((super->s_feature_ro_compat & ~LOGFS_FEATURES_RO_COMPAT) &&
474 !read_only)
475 return -EIO;
476
477 mutex_init(&super->s_dirop_mutex);
478 mutex_init(&super->s_object_alias_mutex);
479 INIT_LIST_HEAD(&super->s_freeing_list);
480
481 ret = logfs_init_rw(sb);
482 if (ret)
483 return ret;
484
485 ret = logfs_init_areas(sb);
486 if (ret)
487 return ret;
488
489 ret = logfs_init_gc(sb);
490 if (ret)
491 return ret;
492
493 ret = logfs_init_journal(sb);
494 if (ret)
495 return ret;
496
497 return 0;
498}
499
500static void logfs_kill_sb(struct super_block *sb)
501{
502 struct logfs_super *super = logfs_super(sb);
503
504 log_super("LogFS: Start unmounting\n");
505 /* Alias entries slow down mount, so evict as many as possible */
506 sync_filesystem(sb);
507 logfs_write_anchor(sb);
508
509 /*
510 * From this point on alias entries are simply dropped - and any
511 * writes to the object store are considered bugs.
512 */
513 super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
514 log_super("LogFS: Now in shutdown\n");
515 generic_shutdown_super(sb);
516
517 BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes);
518
519 logfs_cleanup_gc(sb);
520 logfs_cleanup_journal(sb);
521 logfs_cleanup_areas(sb);
522 logfs_cleanup_rw(sb);
523 if (super->s_erase_page)
524 __free_page(super->s_erase_page);
525 super->s_devops->put_device(sb);
526 logfs_mempool_destroy(super->s_btree_pool);
527 logfs_mempool_destroy(super->s_alias_pool);
528 kfree(super);
529 log_super("LogFS: Finished unmounting\n");
530}
531
532int logfs_get_sb_device(struct file_system_type *type, int flags,
533 struct mtd_info *mtd, struct block_device *bdev,
534 const struct logfs_device_ops *devops, struct vfsmount *mnt)
535{
536 struct logfs_super *super;
537 struct super_block *sb;
538 int err = -ENOMEM;
539 static int mount_count;
540
541 log_super("LogFS: Start mount %x\n", mount_count++);
542 super = kzalloc(sizeof(*super), GFP_KERNEL);
543 if (!super)
544 goto err0;
545
546 super->s_mtd = mtd;
547 super->s_bdev = bdev;
548 err = -EINVAL;
549 sb = sget(type, logfs_sb_test, logfs_sb_set, super);
550 if (IS_ERR(sb))
551 goto err0;
552
553 if (sb->s_root) {
554 /* Device is already in use */
555 err = 0;
556 simple_set_mnt(mnt, sb);
557 goto err0;
558 }
559
560 super->s_devops = devops;
561
562 /*
563 * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache
564 * only covers 16TB and the upper 8TB are used for indirect blocks.
565 * On 64bit system we could bump up the limit, but that would make
566 * the filesystem incompatible with 32bit systems.
567 */
568 sb->s_maxbytes = (1ull << 43) - 1;
569 sb->s_op = &logfs_super_operations;
570 sb->s_flags = flags | MS_NOATIME;
571
572 err = logfs_read_sb(sb, sb->s_flags & MS_RDONLY);
573 if (err)
574 goto err1;
575
576 sb->s_flags |= MS_ACTIVE;
577 err = logfs_get_sb_final(sb, mnt);
578 if (err)
579 goto err1;
580 return 0;
581
582err1:
583 deactivate_locked_super(sb);
584 return err;
585err0:
586 kfree(super);
587 //devops->put_device(sb);
588 return err;
589}
590
591static int logfs_get_sb(struct file_system_type *type, int flags,
592 const char *devname, void *data, struct vfsmount *mnt)
593{
594 ulong mtdnr;
595
596 if (!devname)
597 return logfs_get_sb_bdev(type, flags, devname, mnt);
598 if (strncmp(devname, "mtd", 3))
599 return logfs_get_sb_bdev(type, flags, devname, mnt);
600
601 {
602 char *garbage;
603 mtdnr = simple_strtoul(devname+3, &garbage, 0);
604 if (*garbage)
605 return -EINVAL;
606 }
607
608 return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
609}
610
611static struct file_system_type logfs_fs_type = {
612 .owner = THIS_MODULE,
613 .name = "logfs",
614 .get_sb = logfs_get_sb,
615 .kill_sb = logfs_kill_sb,
616 .fs_flags = FS_REQUIRES_DEV,
617
618};
619
620static int __init logfs_init(void)
621{
622 int ret;
623
624 emergency_page = alloc_pages(GFP_KERNEL, 0);
625 if (!emergency_page)
626 return -ENOMEM;
627
628 ret = logfs_compr_init();
629 if (ret)
630 goto out1;
631
632 ret = logfs_init_inode_cache();
633 if (ret)
634 goto out2;
635
636 return register_filesystem(&logfs_fs_type);
637out2:
638 logfs_compr_exit();
639out1:
640 __free_pages(emergency_page, 0);
641 return ret;
642}
643
644static void __exit logfs_exit(void)
645{
646 unregister_filesystem(&logfs_fs_type);
647 logfs_destroy_inode_cache();
648 logfs_compr_exit();
649 __free_pages(emergency_page, 0);
650}
651
652module_init(logfs_init);
653module_exit(logfs_exit);
654
655MODULE_LICENSE("GPL v2");
656MODULE_AUTHOR("Joern Engel <joern@logfs.org>");
657MODULE_DESCRIPTION("scalable flash filesystem");
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 74ea82d72164..756f8c93780c 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -17,8 +17,10 @@
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/highuid.h> 18#include <linux/highuid.h>
19#include <linux/vfs.h> 19#include <linux/vfs.h>
20#include <linux/writeback.h>
20 21
21static int minix_write_inode(struct inode * inode, int wait); 22static int minix_write_inode(struct inode *inode,
23 struct writeback_control *wbc);
22static int minix_statfs(struct dentry *dentry, struct kstatfs *buf); 24static int minix_statfs(struct dentry *dentry, struct kstatfs *buf);
23static int minix_remount (struct super_block * sb, int * flags, char * data); 25static int minix_remount (struct super_block * sb, int * flags, char * data);
24 26
@@ -552,7 +554,7 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode)
552 return bh; 554 return bh;
553} 555}
554 556
555static int minix_write_inode(struct inode *inode, int wait) 557static int minix_write_inode(struct inode *inode, struct writeback_control *wbc)
556{ 558{
557 int err = 0; 559 int err = 0;
558 struct buffer_head *bh; 560 struct buffer_head *bh;
@@ -563,7 +565,7 @@ static int minix_write_inode(struct inode *inode, int wait)
563 bh = V2_minix_update_inode(inode); 565 bh = V2_minix_update_inode(inode);
564 if (!bh) 566 if (!bh)
565 return -EIO; 567 return -EIO;
566 if (wait && buffer_dirty(bh)) { 568 if (wbc->sync_mode == WB_SYNC_ALL && buffer_dirty(bh)) {
567 sync_dirty_buffer(bh); 569 sync_dirty_buffer(bh);
568 if (buffer_req(bh) && !buffer_uptodate(bh)) { 570 if (buffer_req(bh) && !buffer_uptodate(bh)) {
569 printk("IO error syncing minix inode [%s:%08lx]\n", 571 printk("IO error syncing minix inode [%s:%08lx]\n",
diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c
index 82d6554b02fe..282e15ad8cd8 100644
--- a/fs/minix/itree_v1.c
+++ b/fs/minix/itree_v1.c
@@ -1,4 +1,5 @@
1#include <linux/buffer_head.h> 1#include <linux/buffer_head.h>
2#include <linux/slab.h>
2#include "minix.h" 3#include "minix.h"
3 4
4enum {DEPTH = 3, DIRECT = 7}; /* Only double indirect */ 5enum {DEPTH = 3, DIRECT = 7}; /* Only double indirect */
diff --git a/fs/mpage.c b/fs/mpage.c
index 42381bd6543b..fd56ca2ea556 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -16,6 +16,7 @@
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/kdev_t.h> 18#include <linux/kdev_t.h>
19#include <linux/gfp.h>
19#include <linux/bio.h> 20#include <linux/bio.h>
20#include <linux/fs.h> 21#include <linux/fs.h>
21#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
@@ -561,7 +562,7 @@ page_is_mapped:
561 if (page->index >= end_index) { 562 if (page->index >= end_index) {
562 /* 563 /*
563 * The page straddles i_size. It must be zeroed out on each 564 * The page straddles i_size. It must be zeroed out on each
564 * and every writepage invokation because it may be mmapped. 565 * and every writepage invocation because it may be mmapped.
565 * "A file is mapped in multiples of the page size. For a file 566 * "A file is mapped in multiples of the page size. For a file
566 * that is not a multiple of the page size, the remaining memory 567 * that is not a multiple of the page size, the remaining memory
567 * is zeroed when mapped, and writes to that region are not 568 * is zeroed when mapped, and writes to that region are not
diff --git a/fs/namei.c b/fs/namei.c
index 68921d9b5302..a7dce91a7e42 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -19,7 +19,6 @@
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/namei.h> 21#include <linux/namei.h>
22#include <linux/quotaops.h>
23#include <linux/pagemap.h> 22#include <linux/pagemap.h>
24#include <linux/fsnotify.h> 23#include <linux/fsnotify.h>
25#include <linux/personality.h> 24#include <linux/personality.h>
@@ -232,6 +231,7 @@ int generic_permission(struct inode *inode, int mask,
232 /* 231 /*
233 * Searching includes executable on directories, else just read. 232 * Searching includes executable on directories, else just read.
234 */ 233 */
234 mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
235 if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) 235 if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
236 if (capable(CAP_DAC_READ_SEARCH)) 236 if (capable(CAP_DAC_READ_SEARCH))
237 return 0; 237 return 0;
@@ -497,8 +497,6 @@ static int link_path_walk(const char *, struct nameidata *);
497 497
498static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) 498static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
499{ 499{
500 int res = 0;
501 char *name;
502 if (IS_ERR(link)) 500 if (IS_ERR(link))
503 goto fail; 501 goto fail;
504 502
@@ -509,22 +507,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
509 path_get(&nd->root); 507 path_get(&nd->root);
510 } 508 }
511 509
512 res = link_path_walk(link, nd); 510 return link_path_walk(link, nd);
513 if (nd->depth || res || nd->last_type!=LAST_NORM)
514 return res;
515 /*
516 * If it is an iterative symlinks resolution in open_namei() we
517 * have to copy the last component. And all that crap because of
518 * bloody create() on broken symlinks. Furrfu...
519 */
520 name = __getname();
521 if (unlikely(!name)) {
522 path_put(&nd->path);
523 return -ENOMEM;
524 }
525 strcpy(name, nd->last.name);
526 nd->last.name = name;
527 return 0;
528fail: 511fail:
529 path_put(&nd->path); 512 path_put(&nd->path);
530 return PTR_ERR(link); 513 return PTR_ERR(link);
@@ -546,10 +529,10 @@ static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
546 nd->path.dentry = path->dentry; 529 nd->path.dentry = path->dentry;
547} 530}
548 531
549static __always_inline int __do_follow_link(struct path *path, struct nameidata *nd) 532static __always_inline int
533__do_follow_link(struct path *path, struct nameidata *nd, void **p)
550{ 534{
551 int error; 535 int error;
552 void *cookie;
553 struct dentry *dentry = path->dentry; 536 struct dentry *dentry = path->dentry;
554 537
555 touch_atime(path->mnt, dentry); 538 touch_atime(path->mnt, dentry);
@@ -560,9 +543,10 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
560 dget(dentry); 543 dget(dentry);
561 } 544 }
562 mntget(path->mnt); 545 mntget(path->mnt);
563 cookie = dentry->d_inode->i_op->follow_link(dentry, nd); 546 nd->last_type = LAST_BIND;
564 error = PTR_ERR(cookie); 547 *p = dentry->d_inode->i_op->follow_link(dentry, nd);
565 if (!IS_ERR(cookie)) { 548 error = PTR_ERR(*p);
549 if (!IS_ERR(*p)) {
566 char *s = nd_get_link(nd); 550 char *s = nd_get_link(nd);
567 error = 0; 551 error = 0;
568 if (s) 552 if (s)
@@ -572,8 +556,6 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
572 if (error) 556 if (error)
573 path_put(&nd->path); 557 path_put(&nd->path);
574 } 558 }
575 if (dentry->d_inode->i_op->put_link)
576 dentry->d_inode->i_op->put_link(dentry, nd, cookie);
577 } 559 }
578 return error; 560 return error;
579} 561}
@@ -587,6 +569,7 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
587 */ 569 */
588static inline int do_follow_link(struct path *path, struct nameidata *nd) 570static inline int do_follow_link(struct path *path, struct nameidata *nd)
589{ 571{
572 void *cookie;
590 int err = -ELOOP; 573 int err = -ELOOP;
591 if (current->link_count >= MAX_NESTED_LINKS) 574 if (current->link_count >= MAX_NESTED_LINKS)
592 goto loop; 575 goto loop;
@@ -600,7 +583,9 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd)
600 current->link_count++; 583 current->link_count++;
601 current->total_link_count++; 584 current->total_link_count++;
602 nd->depth++; 585 nd->depth++;
603 err = __do_follow_link(path, nd); 586 err = __do_follow_link(path, nd, &cookie);
587 if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
588 path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
604 path_put(path); 589 path_put(path);
605 current->link_count--; 590 current->link_count--;
606 nd->depth--; 591 nd->depth--;
@@ -687,33 +672,20 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
687 set_root(nd); 672 set_root(nd);
688 673
689 while(1) { 674 while(1) {
690 struct vfsmount *parent;
691 struct dentry *old = nd->path.dentry; 675 struct dentry *old = nd->path.dentry;
692 676
693 if (nd->path.dentry == nd->root.dentry && 677 if (nd->path.dentry == nd->root.dentry &&
694 nd->path.mnt == nd->root.mnt) { 678 nd->path.mnt == nd->root.mnt) {
695 break; 679 break;
696 } 680 }
697 spin_lock(&dcache_lock);
698 if (nd->path.dentry != nd->path.mnt->mnt_root) { 681 if (nd->path.dentry != nd->path.mnt->mnt_root) {
699 nd->path.dentry = dget(nd->path.dentry->d_parent); 682 /* rare case of legitimate dget_parent()... */
700 spin_unlock(&dcache_lock); 683 nd->path.dentry = dget_parent(nd->path.dentry);
701 dput(old); 684 dput(old);
702 break; 685 break;
703 } 686 }
704 spin_unlock(&dcache_lock); 687 if (!follow_up(&nd->path))
705 spin_lock(&vfsmount_lock);
706 parent = nd->path.mnt->mnt_parent;
707 if (parent == nd->path.mnt) {
708 spin_unlock(&vfsmount_lock);
709 break; 688 break;
710 }
711 mntget(parent);
712 nd->path.dentry = dget(nd->path.mnt->mnt_mountpoint);
713 spin_unlock(&vfsmount_lock);
714 dput(old);
715 mntput(nd->path.mnt);
716 nd->path.mnt = parent;
717 } 689 }
718 follow_mount(&nd->path); 690 follow_mount(&nd->path);
719} 691}
@@ -821,6 +793,17 @@ fail:
821} 793}
822 794
823/* 795/*
796 * This is a temporary kludge to deal with "automount" symlinks; proper
797 * solution is to trigger them on follow_mount(), so that do_lookup()
798 * would DTRT. To be killed before 2.6.34-final.
799 */
800static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
801{
802 return inode && unlikely(inode->i_op->follow_link) &&
803 ((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode));
804}
805
806/*
824 * Name resolution. 807 * Name resolution.
825 * This is the basic name resolution function, turning a pathname into 808 * This is the basic name resolution function, turning a pathname into
826 * the final dentry. We expect 'base' to be positive and a directory. 809 * the final dentry. We expect 'base' to be positive and a directory.
@@ -940,8 +923,7 @@ last_component:
940 if (err) 923 if (err)
941 break; 924 break;
942 inode = next.dentry->d_inode; 925 inode = next.dentry->d_inode;
943 if ((lookup_flags & LOOKUP_FOLLOW) 926 if (follow_on_final(inode, lookup_flags)) {
944 && inode && inode->i_op->follow_link) {
945 err = do_follow_link(&next, nd); 927 err = do_follow_link(&next, nd);
946 if (err) 928 if (err)
947 goto return_err; 929 goto return_err;
@@ -1335,7 +1317,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
1335 return -ENOENT; 1317 return -ENOENT;
1336 1318
1337 BUG_ON(victim->d_parent->d_inode != dir); 1319 BUG_ON(victim->d_parent->d_inode != dir);
1338 audit_inode_child(victim->d_name.name, victim, dir); 1320 audit_inode_child(victim, dir);
1339 1321
1340 error = inode_permission(dir, MAY_WRITE | MAY_EXEC); 1322 error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
1341 if (error) 1323 if (error)
@@ -1376,22 +1358,6 @@ static inline int may_create(struct inode *dir, struct dentry *child)
1376 return inode_permission(dir, MAY_WRITE | MAY_EXEC); 1358 return inode_permission(dir, MAY_WRITE | MAY_EXEC);
1377} 1359}
1378 1360
1379/*
1380 * O_DIRECTORY translates into forcing a directory lookup.
1381 */
1382static inline int lookup_flags(unsigned int f)
1383{
1384 unsigned long retval = LOOKUP_FOLLOW;
1385
1386 if (f & O_NOFOLLOW)
1387 retval &= ~LOOKUP_FOLLOW;
1388
1389 if (f & O_DIRECTORY)
1390 retval |= LOOKUP_DIRECTORY;
1391
1392 return retval;
1393}
1394
1395/* 1361/*
1396 * p1 and p2 should be directories on the same fs. 1362 * p1 and p2 should be directories on the same fs.
1397 */ 1363 */
@@ -1449,7 +1415,6 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1449 error = security_inode_create(dir, dentry, mode); 1415 error = security_inode_create(dir, dentry, mode);
1450 if (error) 1416 if (error)
1451 return error; 1417 return error;
1452 vfs_dq_init(dir);
1453 error = dir->i_op->create(dir, dentry, mode, nd); 1418 error = dir->i_op->create(dir, dentry, mode, nd);
1454 if (!error) 1419 if (!error)
1455 fsnotify_create(dir, dentry); 1420 fsnotify_create(dir, dentry);
@@ -1491,7 +1456,7 @@ int may_open(struct path *path, int acc_mode, int flag)
1491 * An append-only file must be opened in append mode for writing. 1456 * An append-only file must be opened in append mode for writing.
1492 */ 1457 */
1493 if (IS_APPEND(inode)) { 1458 if (IS_APPEND(inode)) {
1494 if ((flag & FMODE_WRITE) && !(flag & O_APPEND)) 1459 if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
1495 return -EPERM; 1460 return -EPERM;
1496 if (flag & O_TRUNC) 1461 if (flag & O_TRUNC)
1497 return -EPERM; 1462 return -EPERM;
@@ -1535,7 +1500,7 @@ static int handle_truncate(struct path *path)
1535 * what get passed to sys_open(). 1500 * what get passed to sys_open().
1536 */ 1501 */
1537static int __open_namei_create(struct nameidata *nd, struct path *path, 1502static int __open_namei_create(struct nameidata *nd, struct path *path,
1538 int flag, int mode) 1503 int open_flag, int mode)
1539{ 1504{
1540 int error; 1505 int error;
1541 struct dentry *dir = nd->path.dentry; 1506 struct dentry *dir = nd->path.dentry;
@@ -1553,7 +1518,7 @@ out_unlock:
1553 if (error) 1518 if (error)
1554 return error; 1519 return error;
1555 /* Don't check for write permission, don't truncate */ 1520 /* Don't check for write permission, don't truncate */
1556 return may_open(&nd->path, 0, flag & ~O_TRUNC); 1521 return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
1557} 1522}
1558 1523
1559/* 1524/*
@@ -1591,125 +1556,133 @@ static int open_will_truncate(int flag, struct inode *inode)
1591 return (flag & O_TRUNC); 1556 return (flag & O_TRUNC);
1592} 1557}
1593 1558
1594/* 1559static struct file *finish_open(struct nameidata *nd,
1595 * Note that the low bits of the passed in "open_flag" 1560 int open_flag, int acc_mode)
1596 * are not the same as in the local variable "flag". See
1597 * open_to_namei_flags() for more details.
1598 */
1599struct file *do_filp_open(int dfd, const char *pathname,
1600 int open_flag, int mode, int acc_mode)
1601{ 1561{
1602 struct file *filp; 1562 struct file *filp;
1603 struct nameidata nd;
1604 int error;
1605 struct path path, save;
1606 struct dentry *dir;
1607 int count = 0;
1608 int will_truncate; 1563 int will_truncate;
1609 int flag = open_to_namei_flags(open_flag); 1564 int error;
1610 1565
1566 will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
1567 if (will_truncate) {
1568 error = mnt_want_write(nd->path.mnt);
1569 if (error)
1570 goto exit;
1571 }
1572 error = may_open(&nd->path, acc_mode, open_flag);
1573 if (error) {
1574 if (will_truncate)
1575 mnt_drop_write(nd->path.mnt);
1576 goto exit;
1577 }
1578 filp = nameidata_to_filp(nd);
1579 if (!IS_ERR(filp)) {
1580 error = ima_file_check(filp, acc_mode);
1581 if (error) {
1582 fput(filp);
1583 filp = ERR_PTR(error);
1584 }
1585 }
1586 if (!IS_ERR(filp)) {
1587 if (will_truncate) {
1588 error = handle_truncate(&nd->path);
1589 if (error) {
1590 fput(filp);
1591 filp = ERR_PTR(error);
1592 }
1593 }
1594 }
1611 /* 1595 /*
1612 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only 1596 * It is now safe to drop the mnt write
1613 * check for O_DSYNC if the need any syncing at all we enforce it's 1597 * because the filp has had a write taken
1614 * always set instead of having to deal with possibly weird behaviour 1598 * on its behalf.
1615 * for malicious applications setting only __O_SYNC.
1616 */ 1599 */
1617 if (open_flag & __O_SYNC) 1600 if (will_truncate)
1618 open_flag |= O_DSYNC; 1601 mnt_drop_write(nd->path.mnt);
1619 1602 return filp;
1620 if (!acc_mode)
1621 acc_mode = MAY_OPEN | ACC_MODE(flag);
1622 1603
1623 /* O_TRUNC implies we need access checks for write permissions */ 1604exit:
1624 if (flag & O_TRUNC) 1605 if (!IS_ERR(nd->intent.open.file))
1625 acc_mode |= MAY_WRITE; 1606 release_open_intent(nd);
1607 path_put(&nd->path);
1608 return ERR_PTR(error);
1609}
1626 1610
1627 /* Allow the LSM permission hook to distinguish append 1611static struct file *do_last(struct nameidata *nd, struct path *path,
1628 access from general write access. */ 1612 int open_flag, int acc_mode,
1629 if (flag & O_APPEND) 1613 int mode, const char *pathname)
1630 acc_mode |= MAY_APPEND; 1614{
1615 struct dentry *dir = nd->path.dentry;
1616 struct file *filp;
1617 int error = -EISDIR;
1631 1618
1632 /* 1619 switch (nd->last_type) {
1633 * The simplest case - just a plain lookup. 1620 case LAST_DOTDOT:
1634 */ 1621 follow_dotdot(nd);
1635 if (!(flag & O_CREAT)) { 1622 dir = nd->path.dentry;
1636 filp = get_empty_filp(); 1623 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
1637 1624 if (!dir->d_op->d_revalidate(dir, nd)) {
1638 if (filp == NULL) 1625 error = -ESTALE;
1639 return ERR_PTR(-ENFILE); 1626 goto exit;
1640 nd.intent.open.file = filp;
1641 filp->f_flags = open_flag;
1642 nd.intent.open.flags = flag;
1643 nd.intent.open.create_mode = 0;
1644 error = do_path_lookup(dfd, pathname,
1645 lookup_flags(flag)|LOOKUP_OPEN, &nd);
1646 if (IS_ERR(nd.intent.open.file)) {
1647 if (error == 0) {
1648 error = PTR_ERR(nd.intent.open.file);
1649 path_put(&nd.path);
1650 } 1627 }
1651 } else if (error) 1628 }
1652 release_open_intent(&nd); 1629 /* fallthrough */
1653 if (error) 1630 case LAST_DOT:
1654 return ERR_PTR(error); 1631 case LAST_ROOT:
1632 if (open_flag & O_CREAT)
1633 goto exit;
1634 /* fallthrough */
1635 case LAST_BIND:
1636 audit_inode(pathname, dir);
1655 goto ok; 1637 goto ok;
1656 } 1638 }
1657 1639
1658 /* 1640 /* trailing slashes? */
1659 * Create - we need to know the parent. 1641 if (nd->last.name[nd->last.len]) {
1660 */ 1642 if (open_flag & O_CREAT)
1661 error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); 1643 goto exit;
1662 if (error) 1644 nd->flags |= LOOKUP_DIRECTORY;
1663 return ERR_PTR(error);
1664 error = path_walk(pathname, &nd);
1665 if (error) {
1666 if (nd.root.mnt)
1667 path_put(&nd.root);
1668 return ERR_PTR(error);
1669 } 1645 }
1670 if (unlikely(!audit_dummy_context()))
1671 audit_inode(pathname, nd.path.dentry);
1672 1646
1673 /* 1647 /* just plain open? */
1674 * We have the parent and last component. First of all, check 1648 if (!(open_flag & O_CREAT)) {
1675 * that we are not asked to creat(2) an obvious directory - that 1649 error = do_lookup(nd, &nd->last, path);
1676 * will not do. 1650 if (error)
1677 */ 1651 goto exit;
1678 error = -EISDIR; 1652 error = -ENOENT;
1679 if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len]) 1653 if (!path->dentry->d_inode)
1680 goto exit_parent; 1654 goto exit_dput;
1655 if (path->dentry->d_inode->i_op->follow_link)
1656 return NULL;
1657 error = -ENOTDIR;
1658 if (nd->flags & LOOKUP_DIRECTORY) {
1659 if (!path->dentry->d_inode->i_op->lookup)
1660 goto exit_dput;
1661 }
1662 path_to_nameidata(path, nd);
1663 audit_inode(pathname, nd->path.dentry);
1664 goto ok;
1665 }
1681 1666
1682 error = -ENFILE; 1667 /* OK, it's O_CREAT */
1683 filp = get_empty_filp();
1684 if (filp == NULL)
1685 goto exit_parent;
1686 nd.intent.open.file = filp;
1687 filp->f_flags = open_flag;
1688 nd.intent.open.flags = flag;
1689 nd.intent.open.create_mode = mode;
1690 dir = nd.path.dentry;
1691 nd.flags &= ~LOOKUP_PARENT;
1692 nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;
1693 if (flag & O_EXCL)
1694 nd.flags |= LOOKUP_EXCL;
1695 mutex_lock(&dir->d_inode->i_mutex); 1668 mutex_lock(&dir->d_inode->i_mutex);
1696 path.dentry = lookup_hash(&nd);
1697 path.mnt = nd.path.mnt;
1698 1669
1699do_last: 1670 path->dentry = lookup_hash(nd);
1700 error = PTR_ERR(path.dentry); 1671 path->mnt = nd->path.mnt;
1701 if (IS_ERR(path.dentry)) { 1672
1673 error = PTR_ERR(path->dentry);
1674 if (IS_ERR(path->dentry)) {
1702 mutex_unlock(&dir->d_inode->i_mutex); 1675 mutex_unlock(&dir->d_inode->i_mutex);
1703 goto exit; 1676 goto exit;
1704 } 1677 }
1705 1678
1706 if (IS_ERR(nd.intent.open.file)) { 1679 if (IS_ERR(nd->intent.open.file)) {
1707 error = PTR_ERR(nd.intent.open.file); 1680 error = PTR_ERR(nd->intent.open.file);
1708 goto exit_mutex_unlock; 1681 goto exit_mutex_unlock;
1709 } 1682 }
1710 1683
1711 /* Negative dentry, just create the file */ 1684 /* Negative dentry, just create the file */
1712 if (!path.dentry->d_inode) { 1685 if (!path->dentry->d_inode) {
1713 /* 1686 /*
1714 * This write is needed to ensure that a 1687 * This write is needed to ensure that a
1715 * ro->rw transition does not occur between 1688 * ro->rw transition does not occur between
@@ -1717,21 +1690,18 @@ do_last:
1717 * a permanent write count is taken through 1690 * a permanent write count is taken through
1718 * the 'struct file' in nameidata_to_filp(). 1691 * the 'struct file' in nameidata_to_filp().
1719 */ 1692 */
1720 error = mnt_want_write(nd.path.mnt); 1693 error = mnt_want_write(nd->path.mnt);
1721 if (error) 1694 if (error)
1722 goto exit_mutex_unlock; 1695 goto exit_mutex_unlock;
1723 error = __open_namei_create(&nd, &path, flag, mode); 1696 error = __open_namei_create(nd, path, open_flag, mode);
1724 if (error) { 1697 if (error) {
1725 mnt_drop_write(nd.path.mnt); 1698 mnt_drop_write(nd->path.mnt);
1726 goto exit; 1699 goto exit;
1727 } 1700 }
1728 filp = nameidata_to_filp(&nd); 1701 filp = nameidata_to_filp(nd);
1729 mnt_drop_write(nd.path.mnt); 1702 mnt_drop_write(nd->path.mnt);
1730 if (nd.root.mnt)
1731 path_put(&nd.root);
1732 if (!IS_ERR(filp)) { 1703 if (!IS_ERR(filp)) {
1733 error = ima_path_check(&filp->f_path, filp->f_mode & 1704 error = ima_file_check(filp, acc_mode);
1734 (MAY_READ | MAY_WRITE | MAY_EXEC));
1735 if (error) { 1705 if (error) {
1736 fput(filp); 1706 fput(filp);
1737 filp = ERR_PTR(error); 1707 filp = ERR_PTR(error);
@@ -1744,157 +1714,182 @@ do_last:
1744 * It already exists. 1714 * It already exists.
1745 */ 1715 */
1746 mutex_unlock(&dir->d_inode->i_mutex); 1716 mutex_unlock(&dir->d_inode->i_mutex);
1747 audit_inode(pathname, path.dentry); 1717 audit_inode(pathname, path->dentry);
1748 1718
1749 error = -EEXIST; 1719 error = -EEXIST;
1750 if (flag & O_EXCL) 1720 if (open_flag & O_EXCL)
1751 goto exit_dput; 1721 goto exit_dput;
1752 1722
1753 if (__follow_mount(&path)) { 1723 if (__follow_mount(path)) {
1754 error = -ELOOP; 1724 error = -ELOOP;
1755 if (flag & O_NOFOLLOW) 1725 if (open_flag & O_NOFOLLOW)
1756 goto exit_dput; 1726 goto exit_dput;
1757 } 1727 }
1758 1728
1759 error = -ENOENT; 1729 error = -ENOENT;
1760 if (!path.dentry->d_inode) 1730 if (!path->dentry->d_inode)
1761 goto exit_dput; 1731 goto exit_dput;
1762 if (path.dentry->d_inode->i_op->follow_link)
1763 goto do_link;
1764 1732
1765 path_to_nameidata(&path, &nd); 1733 if (path->dentry->d_inode->i_op->follow_link)
1734 return NULL;
1735
1736 path_to_nameidata(path, nd);
1766 error = -EISDIR; 1737 error = -EISDIR;
1767 if (S_ISDIR(path.dentry->d_inode->i_mode)) 1738 if (S_ISDIR(path->dentry->d_inode->i_mode))
1768 goto exit; 1739 goto exit;
1769ok: 1740ok:
1770 /* 1741 filp = finish_open(nd, open_flag, acc_mode);
1771 * Consider:
1772 * 1. may_open() truncates a file
1773 * 2. a rw->ro mount transition occurs
1774 * 3. nameidata_to_filp() fails due to
1775 * the ro mount.
1776 * That would be inconsistent, and should
1777 * be avoided. Taking this mnt write here
1778 * ensures that (2) can not occur.
1779 */
1780 will_truncate = open_will_truncate(flag, nd.path.dentry->d_inode);
1781 if (will_truncate) {
1782 error = mnt_want_write(nd.path.mnt);
1783 if (error)
1784 goto exit;
1785 }
1786 error = may_open(&nd.path, acc_mode, flag);
1787 if (error) {
1788 if (will_truncate)
1789 mnt_drop_write(nd.path.mnt);
1790 goto exit;
1791 }
1792 filp = nameidata_to_filp(&nd);
1793 if (!IS_ERR(filp)) {
1794 error = ima_path_check(&filp->f_path, filp->f_mode &
1795 (MAY_READ | MAY_WRITE | MAY_EXEC));
1796 if (error) {
1797 fput(filp);
1798 filp = ERR_PTR(error);
1799 }
1800 }
1801 if (!IS_ERR(filp)) {
1802 if (acc_mode & MAY_WRITE)
1803 vfs_dq_init(nd.path.dentry->d_inode);
1804
1805 if (will_truncate) {
1806 error = handle_truncate(&nd.path);
1807 if (error) {
1808 fput(filp);
1809 filp = ERR_PTR(error);
1810 }
1811 }
1812 }
1813 /*
1814 * It is now safe to drop the mnt write
1815 * because the filp has had a write taken
1816 * on its behalf.
1817 */
1818 if (will_truncate)
1819 mnt_drop_write(nd.path.mnt);
1820 if (nd.root.mnt)
1821 path_put(&nd.root);
1822 return filp; 1742 return filp;
1823 1743
1824exit_mutex_unlock: 1744exit_mutex_unlock:
1825 mutex_unlock(&dir->d_inode->i_mutex); 1745 mutex_unlock(&dir->d_inode->i_mutex);
1826exit_dput: 1746exit_dput:
1827 path_put_conditional(&path, &nd); 1747 path_put_conditional(path, nd);
1828exit: 1748exit:
1829 if (!IS_ERR(nd.intent.open.file)) 1749 if (!IS_ERR(nd->intent.open.file))
1830 release_open_intent(&nd); 1750 release_open_intent(nd);
1831exit_parent: 1751 path_put(&nd->path);
1832 if (nd.root.mnt)
1833 path_put(&nd.root);
1834 path_put(&nd.path);
1835 return ERR_PTR(error); 1752 return ERR_PTR(error);
1753}
1754
1755/*
1756 * Note that the low bits of the passed in "open_flag"
1757 * are not the same as in the local variable "flag". See
1758 * open_to_namei_flags() for more details.
1759 */
1760struct file *do_filp_open(int dfd, const char *pathname,
1761 int open_flag, int mode, int acc_mode)
1762{
1763 struct file *filp;
1764 struct nameidata nd;
1765 int error;
1766 struct path path;
1767 int count = 0;
1768 int flag = open_to_namei_flags(open_flag);
1769 int force_reval = 0;
1770
1771 if (!(open_flag & O_CREAT))
1772 mode = 0;
1836 1773
1837do_link:
1838 error = -ELOOP;
1839 if (flag & O_NOFOLLOW)
1840 goto exit_dput;
1841 /* 1774 /*
1842 * This is subtle. Instead of calling do_follow_link() we do the 1775 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
1843 * thing by hands. The reason is that this way we have zero link_count 1776 * check for O_DSYNC if the need any syncing at all we enforce it's
1844 * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT. 1777 * always set instead of having to deal with possibly weird behaviour
1845 * After that we have the parent and last component, i.e. 1778 * for malicious applications setting only __O_SYNC.
1846 * we are in the same situation as after the first path_walk().
1847 * Well, almost - if the last component is normal we get its copy
1848 * stored in nd->last.name and we will have to putname() it when we
1849 * are done. Procfs-like symlinks just set LAST_BIND.
1850 */ 1779 */
1851 nd.flags |= LOOKUP_PARENT; 1780 if (open_flag & __O_SYNC)
1852 error = security_inode_follow_link(path.dentry, &nd); 1781 open_flag |= O_DSYNC;
1782
1783 if (!acc_mode)
1784 acc_mode = MAY_OPEN | ACC_MODE(open_flag);
1785
1786 /* O_TRUNC implies we need access checks for write permissions */
1787 if (open_flag & O_TRUNC)
1788 acc_mode |= MAY_WRITE;
1789
1790 /* Allow the LSM permission hook to distinguish append
1791 access from general write access. */
1792 if (open_flag & O_APPEND)
1793 acc_mode |= MAY_APPEND;
1794
1795 /* find the parent */
1796reval:
1797 error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
1853 if (error) 1798 if (error)
1854 goto exit_dput; 1799 return ERR_PTR(error);
1855 save = nd.path; 1800 if (force_reval)
1856 path_get(&save);
1857 error = __do_follow_link(&path, &nd);
1858 if (error == -ESTALE) {
1859 /* nd.path had been dropped */
1860 nd.path = save;
1861 path_get(&nd.path);
1862 nd.flags |= LOOKUP_REVAL; 1801 nd.flags |= LOOKUP_REVAL;
1863 error = __do_follow_link(&path, &nd); 1802
1864 } 1803 current->total_link_count = 0;
1865 path_put(&save); 1804 error = link_path_walk(pathname, &nd);
1866 path_put(&path);
1867 if (error) { 1805 if (error) {
1868 /* Does someone understand code flow here? Or it is only 1806 filp = ERR_PTR(error);
1869 * me so stupid? Anathema to whoever designed this non-sense 1807 goto out;
1870 * with "intent.open".
1871 */
1872 release_open_intent(&nd);
1873 if (nd.root.mnt)
1874 path_put(&nd.root);
1875 return ERR_PTR(error);
1876 } 1808 }
1809 if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT))
1810 audit_inode(pathname, nd.path.dentry);
1811
1812 /*
1813 * We have the parent and last component.
1814 */
1815
1816 error = -ENFILE;
1817 filp = get_empty_filp();
1818 if (filp == NULL)
1819 goto exit_parent;
1820 nd.intent.open.file = filp;
1821 filp->f_flags = open_flag;
1822 nd.intent.open.flags = flag;
1823 nd.intent.open.create_mode = mode;
1877 nd.flags &= ~LOOKUP_PARENT; 1824 nd.flags &= ~LOOKUP_PARENT;
1878 if (nd.last_type == LAST_BIND) 1825 nd.flags |= LOOKUP_OPEN;
1879 goto ok; 1826 if (open_flag & O_CREAT) {
1880 error = -EISDIR; 1827 nd.flags |= LOOKUP_CREATE;
1881 if (nd.last_type != LAST_NORM) 1828 if (open_flag & O_EXCL)
1882 goto exit; 1829 nd.flags |= LOOKUP_EXCL;
1883 if (nd.last.name[nd.last.len]) { 1830 }
1884 __putname(nd.last.name); 1831 if (open_flag & O_DIRECTORY)
1885 goto exit; 1832 nd.flags |= LOOKUP_DIRECTORY;
1833 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1834 while (unlikely(!filp)) { /* trailing symlink */
1835 struct path holder;
1836 struct inode *inode = path.dentry->d_inode;
1837 void *cookie;
1838 error = -ELOOP;
1839 /* S_ISDIR part is a temporary automount kludge */
1840 if ((open_flag & O_NOFOLLOW) && !S_ISDIR(inode->i_mode))
1841 goto exit_dput;
1842 if (count++ == 32)
1843 goto exit_dput;
1844 /*
1845 * This is subtle. Instead of calling do_follow_link() we do
1846 * the thing by hands. The reason is that this way we have zero
1847 * link_count and path_walk() (called from ->follow_link)
1848 * honoring LOOKUP_PARENT. After that we have the parent and
1849 * last component, i.e. we are in the same situation as after
1850 * the first path_walk(). Well, almost - if the last component
1851 * is normal we get its copy stored in nd->last.name and we will
1852 * have to putname() it when we are done. Procfs-like symlinks
1853 * just set LAST_BIND.
1854 */
1855 nd.flags |= LOOKUP_PARENT;
1856 error = security_inode_follow_link(path.dentry, &nd);
1857 if (error)
1858 goto exit_dput;
1859 error = __do_follow_link(&path, &nd, &cookie);
1860 if (unlikely(error)) {
1861 /* nd.path had been dropped */
1862 if (!IS_ERR(cookie) && inode->i_op->put_link)
1863 inode->i_op->put_link(path.dentry, &nd, cookie);
1864 path_put(&path);
1865 release_open_intent(&nd);
1866 filp = ERR_PTR(error);
1867 goto out;
1868 }
1869 holder = path;
1870 nd.flags &= ~LOOKUP_PARENT;
1871 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1872 if (inode->i_op->put_link)
1873 inode->i_op->put_link(holder.dentry, &nd, cookie);
1874 path_put(&holder);
1886 } 1875 }
1887 error = -ELOOP; 1876out:
1888 if (count++==32) { 1877 if (nd.root.mnt)
1889 __putname(nd.last.name); 1878 path_put(&nd.root);
1890 goto exit; 1879 if (filp == ERR_PTR(-ESTALE) && !force_reval) {
1880 force_reval = 1;
1881 goto reval;
1891 } 1882 }
1892 dir = nd.path.dentry; 1883 return filp;
1893 mutex_lock(&dir->d_inode->i_mutex); 1884
1894 path.dentry = lookup_hash(&nd); 1885exit_dput:
1895 path.mnt = nd.path.mnt; 1886 path_put_conditional(&path, &nd);
1896 __putname(nd.last.name); 1887 if (!IS_ERR(nd.intent.open.file))
1897 goto do_last; 1888 release_open_intent(&nd);
1889exit_parent:
1890 path_put(&nd.path);
1891 filp = ERR_PTR(error);
1892 goto out;
1898} 1893}
1899 1894
1900/** 1895/**
@@ -1988,7 +1983,6 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1988 if (error) 1983 if (error)
1989 return error; 1984 return error;
1990 1985
1991 vfs_dq_init(dir);
1992 error = dir->i_op->mknod(dir, dentry, mode, dev); 1986 error = dir->i_op->mknod(dir, dentry, mode, dev);
1993 if (!error) 1987 if (!error)
1994 fsnotify_create(dir, dentry); 1988 fsnotify_create(dir, dentry);
@@ -2087,7 +2081,6 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2087 if (error) 2081 if (error)
2088 return error; 2082 return error;
2089 2083
2090 vfs_dq_init(dir);
2091 error = dir->i_op->mkdir(dir, dentry, mode); 2084 error = dir->i_op->mkdir(dir, dentry, mode);
2092 if (!error) 2085 if (!error)
2093 fsnotify_mkdir(dir, dentry); 2086 fsnotify_mkdir(dir, dentry);
@@ -2173,8 +2166,6 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2173 if (!dir->i_op->rmdir) 2166 if (!dir->i_op->rmdir)
2174 return -EPERM; 2167 return -EPERM;
2175 2168
2176 vfs_dq_init(dir);
2177
2178 mutex_lock(&dentry->d_inode->i_mutex); 2169 mutex_lock(&dentry->d_inode->i_mutex);
2179 dentry_unhash(dentry); 2170 dentry_unhash(dentry);
2180 if (d_mountpoint(dentry)) 2171 if (d_mountpoint(dentry))
@@ -2260,15 +2251,16 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
2260 if (!dir->i_op->unlink) 2251 if (!dir->i_op->unlink)
2261 return -EPERM; 2252 return -EPERM;
2262 2253
2263 vfs_dq_init(dir);
2264
2265 mutex_lock(&dentry->d_inode->i_mutex); 2254 mutex_lock(&dentry->d_inode->i_mutex);
2266 if (d_mountpoint(dentry)) 2255 if (d_mountpoint(dentry))
2267 error = -EBUSY; 2256 error = -EBUSY;
2268 else { 2257 else {
2269 error = security_inode_unlink(dir, dentry); 2258 error = security_inode_unlink(dir, dentry);
2270 if (!error) 2259 if (!error) {
2271 error = dir->i_op->unlink(dir, dentry); 2260 error = dir->i_op->unlink(dir, dentry);
2261 if (!error)
2262 dentry->d_inode->i_flags |= S_DEAD;
2263 }
2272 } 2264 }
2273 mutex_unlock(&dentry->d_inode->i_mutex); 2265 mutex_unlock(&dentry->d_inode->i_mutex);
2274 2266
@@ -2371,7 +2363,6 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
2371 if (error) 2363 if (error)
2372 return error; 2364 return error;
2373 2365
2374 vfs_dq_init(dir);
2375 error = dir->i_op->symlink(dir, dentry, oldname); 2366 error = dir->i_op->symlink(dir, dentry, oldname);
2376 if (!error) 2367 if (!error)
2377 fsnotify_create(dir, dentry); 2368 fsnotify_create(dir, dentry);
@@ -2455,7 +2446,6 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
2455 return error; 2446 return error;
2456 2447
2457 mutex_lock(&inode->i_mutex); 2448 mutex_lock(&inode->i_mutex);
2458 vfs_dq_init(dir);
2459 error = dir->i_op->link(old_dentry, dir, new_dentry); 2449 error = dir->i_op->link(old_dentry, dir, new_dentry);
2460 mutex_unlock(&inode->i_mutex); 2450 mutex_unlock(&inode->i_mutex);
2461 if (!error) 2451 if (!error)
@@ -2556,7 +2546,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
2556 * e) conversion from fhandle to dentry may come in the wrong moment - when 2546 * e) conversion from fhandle to dentry may come in the wrong moment - when
2557 * we are removing the target. Solution: we will have to grab ->i_mutex 2547 * we are removing the target. Solution: we will have to grab ->i_mutex
2558 * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on 2548 * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
2559 * ->i_mutex on parents, which works but leads to some truely excessive 2549 * ->i_mutex on parents, which works but leads to some truly excessive
2560 * locking]. 2550 * locking].
2561 */ 2551 */
2562static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, 2552static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
@@ -2621,6 +2611,8 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
2621 else 2611 else
2622 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 2612 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2623 if (!error) { 2613 if (!error) {
2614 if (target)
2615 target->i_flags |= S_DEAD;
2624 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 2616 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
2625 d_move(old_dentry, new_dentry); 2617 d_move(old_dentry, new_dentry);
2626 } 2618 }
@@ -2654,20 +2646,15 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
2654 if (!old_dir->i_op->rename) 2646 if (!old_dir->i_op->rename)
2655 return -EPERM; 2647 return -EPERM;
2656 2648
2657 vfs_dq_init(old_dir);
2658 vfs_dq_init(new_dir);
2659
2660 old_name = fsnotify_oldname_init(old_dentry->d_name.name); 2649 old_name = fsnotify_oldname_init(old_dentry->d_name.name);
2661 2650
2662 if (is_dir) 2651 if (is_dir)
2663 error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry); 2652 error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
2664 else 2653 else
2665 error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry); 2654 error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
2666 if (!error) { 2655 if (!error)
2667 const char *new_name = old_dentry->d_name.name; 2656 fsnotify_move(old_dir, new_dir, old_name, is_dir,
2668 fsnotify_move(old_dir, new_dir, old_name, new_name, is_dir,
2669 new_dentry->d_inode, old_dentry); 2657 new_dentry->d_inode, old_dentry);
2670 }
2671 fsnotify_oldname_free(old_name); 2658 fsnotify_oldname_free(old_name);
2672 2659
2673 return error; 2660 return error;
diff --git a/fs/namespace.c b/fs/namespace.c
index 7d70d63ceb29..8174c8ab5c70 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -573,7 +573,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
573 mnt->mnt_master = old; 573 mnt->mnt_master = old;
574 CLEAR_MNT_SHARED(mnt); 574 CLEAR_MNT_SHARED(mnt);
575 } else if (!(flag & CL_PRIVATE)) { 575 } else if (!(flag & CL_PRIVATE)) {
576 if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old)) 576 if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
577 list_add(&mnt->mnt_share, &old->mnt_share); 577 list_add(&mnt->mnt_share, &old->mnt_share);
578 if (IS_MNT_SLAVE(old)) 578 if (IS_MNT_SLAVE(old))
579 list_add(&mnt->mnt_slave, &old->mnt_slave); 579 list_add(&mnt->mnt_slave, &old->mnt_slave);
@@ -737,6 +737,21 @@ static void m_stop(struct seq_file *m, void *v)
737 up_read(&namespace_sem); 737 up_read(&namespace_sem);
738} 738}
739 739
740int mnt_had_events(struct proc_mounts *p)
741{
742 struct mnt_namespace *ns = p->ns;
743 int res = 0;
744
745 spin_lock(&vfsmount_lock);
746 if (p->event != ns->event) {
747 p->event = ns->event;
748 res = 1;
749 }
750 spin_unlock(&vfsmount_lock);
751
752 return res;
753}
754
740struct proc_fs_info { 755struct proc_fs_info {
741 int flag; 756 int flag;
742 const char *str; 757 const char *str;
@@ -965,10 +980,12 @@ EXPORT_SYMBOL(may_umount_tree);
965int may_umount(struct vfsmount *mnt) 980int may_umount(struct vfsmount *mnt)
966{ 981{
967 int ret = 1; 982 int ret = 1;
983 down_read(&namespace_sem);
968 spin_lock(&vfsmount_lock); 984 spin_lock(&vfsmount_lock);
969 if (propagate_mount_busy(mnt, 2)) 985 if (propagate_mount_busy(mnt, 2))
970 ret = 0; 986 ret = 0;
971 spin_unlock(&vfsmount_lock); 987 spin_unlock(&vfsmount_lock);
988 up_read(&namespace_sem);
972 return ret; 989 return ret;
973} 990}
974 991
@@ -1119,8 +1136,15 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
1119{ 1136{
1120 struct path path; 1137 struct path path;
1121 int retval; 1138 int retval;
1139 int lookup_flags = 0;
1140
1141 if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
1142 return -EINVAL;
1143
1144 if (!(flags & UMOUNT_NOFOLLOW))
1145 lookup_flags |= LOOKUP_FOLLOW;
1122 1146
1123 retval = user_path(name, &path); 1147 retval = user_path_at(AT_FDCWD, name, lookup_flags, &path);
1124 if (retval) 1148 if (retval)
1125 goto out; 1149 goto out;
1126 retval = -EINVAL; 1150 retval = -EINVAL;
@@ -1244,6 +1268,21 @@ void drop_collected_mounts(struct vfsmount *mnt)
1244 release_mounts(&umount_list); 1268 release_mounts(&umount_list);
1245} 1269}
1246 1270
1271int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
1272 struct vfsmount *root)
1273{
1274 struct vfsmount *mnt;
1275 int res = f(root, arg);
1276 if (res)
1277 return res;
1278 list_for_each_entry(mnt, &root->mnt_list, mnt_list) {
1279 res = f(mnt, arg);
1280 if (res)
1281 return res;
1282 }
1283 return 0;
1284}
1285
1247static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end) 1286static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end)
1248{ 1287{
1249 struct vfsmount *p; 1288 struct vfsmount *p;
@@ -1352,12 +1391,12 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
1352 if (err) 1391 if (err)
1353 goto out_cleanup_ids; 1392 goto out_cleanup_ids;
1354 1393
1394 spin_lock(&vfsmount_lock);
1395
1355 if (IS_MNT_SHARED(dest_mnt)) { 1396 if (IS_MNT_SHARED(dest_mnt)) {
1356 for (p = source_mnt; p; p = next_mnt(p, source_mnt)) 1397 for (p = source_mnt; p; p = next_mnt(p, source_mnt))
1357 set_mnt_shared(p); 1398 set_mnt_shared(p);
1358 } 1399 }
1359
1360 spin_lock(&vfsmount_lock);
1361 if (parent_path) { 1400 if (parent_path) {
1362 detach_mnt(source_mnt, parent_path); 1401 detach_mnt(source_mnt, parent_path);
1363 attach_mnt(source_mnt, path); 1402 attach_mnt(source_mnt, path);
@@ -1534,8 +1573,12 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
1534 err = change_mount_flags(path->mnt, flags); 1573 err = change_mount_flags(path->mnt, flags);
1535 else 1574 else
1536 err = do_remount_sb(sb, flags, data, 0); 1575 err = do_remount_sb(sb, flags, data, 0);
1537 if (!err) 1576 if (!err) {
1577 spin_lock(&vfsmount_lock);
1578 mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
1538 path->mnt->mnt_flags = mnt_flags; 1579 path->mnt->mnt_flags = mnt_flags;
1580 spin_unlock(&vfsmount_lock);
1581 }
1539 up_write(&sb->s_umount); 1582 up_write(&sb->s_umount);
1540 if (!err) { 1583 if (!err) {
1541 security_sb_post_remount(path->mnt, flags, data); 1584 security_sb_post_remount(path->mnt, flags, data);
@@ -1665,6 +1708,8 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
1665{ 1708{
1666 int err; 1709 int err;
1667 1710
1711 mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
1712
1668 down_write(&namespace_sem); 1713 down_write(&namespace_sem);
1669 /* Something was mounted here while we slept */ 1714 /* Something was mounted here while we slept */
1670 while (d_mountpoint(path->dentry) && 1715 while (d_mountpoint(path->dentry) &&
@@ -2306,17 +2351,13 @@ void __init mnt_init(void)
2306 2351
2307void put_mnt_ns(struct mnt_namespace *ns) 2352void put_mnt_ns(struct mnt_namespace *ns)
2308{ 2353{
2309 struct vfsmount *root;
2310 LIST_HEAD(umount_list); 2354 LIST_HEAD(umount_list);
2311 2355
2312 if (!atomic_dec_and_lock(&ns->count, &vfsmount_lock)) 2356 if (!atomic_dec_and_test(&ns->count))
2313 return; 2357 return;
2314 root = ns->root;
2315 ns->root = NULL;
2316 spin_unlock(&vfsmount_lock);
2317 down_write(&namespace_sem); 2358 down_write(&namespace_sem);
2318 spin_lock(&vfsmount_lock); 2359 spin_lock(&vfsmount_lock);
2319 umount_tree(root, 0, &umount_list); 2360 umount_tree(ns->root, 0, &umount_list);
2320 spin_unlock(&vfsmount_lock); 2361 spin_unlock(&vfsmount_lock);
2321 up_write(&namespace_sem); 2362 up_write(&namespace_sem);
2322 release_mounts(&umount_list); 2363 release_mounts(&umount_list);
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index b8b5b30d53f0..7edfcd4d5e52 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -15,7 +15,6 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/stat.h> 16#include <linux/stat.h>
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/slab.h>
19#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
20#include <linux/mm.h> 19#include <linux/mm.h>
21#include <asm/uaccess.h> 20#include <asm/uaccess.h>
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 6a7d901f1936..1daabb90e0a5 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -15,7 +15,6 @@
15#include <linux/fcntl.h> 15#include <linux/fcntl.h>
16#include <linux/stat.h> 16#include <linux/stat.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/slab.h>
19#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
20#include <linux/sched.h> 19#include <linux/sched.h>
21#include <linux/smp_lock.h> 20#include <linux/smp_lock.h>
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index cf98da1be23e..fa3385154023 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -526,10 +526,15 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
526 sb->s_blocksize_bits = 10; 526 sb->s_blocksize_bits = 10;
527 sb->s_magic = NCP_SUPER_MAGIC; 527 sb->s_magic = NCP_SUPER_MAGIC;
528 sb->s_op = &ncp_sops; 528 sb->s_op = &ncp_sops;
529 sb->s_bdi = &server->bdi;
529 530
530 server = NCP_SBP(sb); 531 server = NCP_SBP(sb);
531 memset(server, 0, sizeof(*server)); 532 memset(server, 0, sizeof(*server));
532 533
534 error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY);
535 if (error)
536 goto out_bdi;
537
533 server->ncp_filp = ncp_filp; 538 server->ncp_filp = ncp_filp;
534 server->ncp_sock = sock; 539 server->ncp_sock = sock;
535 540
@@ -719,6 +724,8 @@ out_fput2:
719 if (server->info_filp) 724 if (server->info_filp)
720 fput(server->info_filp); 725 fput(server->info_filp);
721out_fput: 726out_fput:
727 bdi_destroy(&server->bdi);
728out_bdi:
722 /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>: 729 /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>:
723 * 730 *
724 * The previously used put_filp(ncp_filp); was bogous, since 731 * The previously used put_filp(ncp_filp); was bogous, since
@@ -756,6 +763,7 @@ static void ncp_put_super(struct super_block *sb)
756 kill_pid(server->m.wdog_pid, SIGTERM, 1); 763 kill_pid(server->m.wdog_pid, SIGTERM, 1);
757 put_pid(server->m.wdog_pid); 764 put_pid(server->m.wdog_pid);
758 765
766 bdi_destroy(&server->bdi);
759 kfree(server->priv.data); 767 kfree(server->priv.data);
760 kfree(server->auth.object_name); 768 kfree(server->auth.object_name);
761 vfree(server->rxbuf); 769 vfree(server->rxbuf);
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index ec8f45f12e05..60a5e2864ea8 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -15,6 +15,7 @@
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/slab.h>
18#include <linux/highuid.h> 19#include <linux/highuid.h>
19#include <linux/smp_lock.h> 20#include <linux/smp_lock.h>
20#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 15458decdb8a..56f5b3a0e1ee 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -9,12 +9,12 @@
9#include <linux/stat.h> 9#include <linux/stat.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/gfp.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
13#include <linux/shm.h> 14#include <linux/shm.h>
14#include <linux/errno.h> 15#include <linux/errno.h>
15#include <linux/mman.h> 16#include <linux/mman.h>
16#include <linux/string.h> 17#include <linux/string.h>
17#include <linux/slab.h>
18#include <linux/fcntl.h> 18#include <linux/fcntl.h>
19#include <linux/ncp_fs.h> 19#include <linux/ncp_fs.h>
20 20
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index e37df8d5fe70..c7ff6c700a6e 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -21,6 +21,7 @@
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/netdevice.h> 22#include <linux/netdevice.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/slab.h>
24#include <net/scm.h> 25#include <net/scm.h>
25#include <net/sock.h> 26#include <net/sock.h>
26#include <linux/ipx.h> 27#include <linux/ipx.h>
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index e3d26c1bd105..c634fd17b337 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -27,6 +27,7 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/ncp_fs.h> 28#include <linux/ncp_fs.h>
29#include <linux/time.h> 29#include <linux/time.h>
30#include <linux/slab.h>
30#include <linux/mm.h> 31#include <linux/mm.h>
31#include <linux/stat.h> 32#include <linux/stat.h>
32#include "ncplib_kernel.h" 33#include "ncplib_kernel.h"
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 59e5673b4597..a43d07e7b924 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -95,8 +95,7 @@ config ROOT_NFS
95 Most people say N here. 95 Most people say N here.
96 96
97config NFS_FSCACHE 97config NFS_FSCACHE
98 bool "Provide NFS client caching support (EXPERIMENTAL)" 98 bool "Provide NFS client caching support"
99 depends on EXPERIMENTAL
100 depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y 99 depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
101 help 100 help
102 Say Y here if you want NFS data to be cached locally on disc through 101 Say Y here if you want NFS data to be cached locally on disc through
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index b4ffd0146ea6..84690319e625 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -10,6 +10,7 @@
10#include <linux/moduleparam.h> 10#include <linux/moduleparam.h>
11#include <linux/mount.h> 11#include <linux/mount.h>
12#include <linux/namei.h> 12#include <linux/namei.h>
13#include <linux/slab.h>
13#include <linux/sunrpc/cache.h> 14#include <linux/sunrpc/cache.h>
14#include <linux/sunrpc/rpc_pipe_fs.h> 15#include <linux/sunrpc/rpc_pipe_fs.h>
15 16
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 73ab220354df..36dfdae95123 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -118,7 +118,6 @@ nfs4_callback_up(struct svc_serv *serv)
118 dprintk("NFS: Callback listener port = %u (af %u)\n", 118 dprintk("NFS: Callback listener port = %u (af %u)\n",
119 nfs_callback_tcpport, PF_INET); 119 nfs_callback_tcpport, PF_INET);
120 120
121#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
122 ret = svc_create_xprt(serv, "tcp", PF_INET6, 121 ret = svc_create_xprt(serv, "tcp", PF_INET6,
123 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); 122 nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
124 if (ret > 0) { 123 if (ret > 0) {
@@ -129,7 +128,6 @@ nfs4_callback_up(struct svc_serv *serv)
129 ret = 0; 128 ret = 0;
130 else 129 else
131 goto out_err; 130 goto out_err;
132#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
133 131
134 return svc_prepare_thread(serv, &serv->sv_pools[0]); 132 return svc_prepare_thread(serv, &serv->sv_pools[0]);
135 133
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index d4036be0b589..85a7cfd1b8dd 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -119,6 +119,14 @@ struct cb_recallanyargs {
119}; 119};
120 120
121extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy); 121extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy);
122
123struct cb_recallslotargs {
124 struct sockaddr *crsa_addr;
125 uint32_t crsa_target_max_slots;
126};
127extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args,
128 void *dummy);
129
122#endif /* CONFIG_NFS_V4_1 */ 130#endif /* CONFIG_NFS_V4_1 */
123 131
124extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); 132extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index defa9b4c470e..a08770a7e857 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -7,6 +7,7 @@
7 */ 7 */
8#include <linux/nfs4.h> 8#include <linux/nfs4.h>
9#include <linux/nfs_fs.h> 9#include <linux/nfs_fs.h>
10#include <linux/slab.h>
10#include "nfs4_fs.h" 11#include "nfs4_fs.h"
11#include "callback.h" 12#include "callback.h"
12#include "delegation.h" 13#include "delegation.h"
@@ -143,44 +144,49 @@ int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const n
143 * Return success if the sequenceID is one more than what we last saw on 144 * Return success if the sequenceID is one more than what we last saw on
144 * this slot, accounting for wraparound. Increments the slot's sequence. 145 * this slot, accounting for wraparound. Increments the slot's sequence.
145 * 146 *
146 * We don't yet implement a duplicate request cache, so at this time 147 * We don't yet implement a duplicate request cache, instead we set the
147 * we will log replays, and process them as if we had not seen them before, 148 * back channel ca_maxresponsesize_cached to zero. This is OK for now
148 * but we don't bump the sequence in the slot. Not too worried about it,
149 * since we only currently implement idempotent callbacks anyway. 149 * since we only currently implement idempotent callbacks anyway.
150 * 150 *
151 * We have a single slot backchannel at this time, so we don't bother 151 * We have a single slot backchannel at this time, so we don't bother
152 * checking the used_slots bit array on the table. The lower layer guarantees 152 * checking the used_slots bit array on the table. The lower layer guarantees
153 * a single outstanding callback request at a time. 153 * a single outstanding callback request at a time.
154 */ 154 */
155static int 155static __be32
156validate_seqid(struct nfs4_slot_table *tbl, u32 slotid, u32 seqid) 156validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
157{ 157{
158 struct nfs4_slot *slot; 158 struct nfs4_slot *slot;
159 159
160 dprintk("%s enter. slotid %d seqid %d\n", 160 dprintk("%s enter. slotid %d seqid %d\n",
161 __func__, slotid, seqid); 161 __func__, args->csa_slotid, args->csa_sequenceid);
162 162
163 if (slotid > NFS41_BC_MAX_CALLBACKS) 163 if (args->csa_slotid > NFS41_BC_MAX_CALLBACKS)
164 return htonl(NFS4ERR_BADSLOT); 164 return htonl(NFS4ERR_BADSLOT);
165 165
166 slot = tbl->slots + slotid; 166 slot = tbl->slots + args->csa_slotid;
167 dprintk("%s slot table seqid: %d\n", __func__, slot->seq_nr); 167 dprintk("%s slot table seqid: %d\n", __func__, slot->seq_nr);
168 168
169 /* Normal */ 169 /* Normal */
170 if (likely(seqid == slot->seq_nr + 1)) { 170 if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {
171 slot->seq_nr++; 171 slot->seq_nr++;
172 return htonl(NFS4_OK); 172 return htonl(NFS4_OK);
173 } 173 }
174 174
175 /* Replay */ 175 /* Replay */
176 if (seqid == slot->seq_nr) { 176 if (args->csa_sequenceid == slot->seq_nr) {
177 dprintk("%s seqid %d is a replay - no DRC available\n", 177 dprintk("%s seqid %d is a replay\n",
178 __func__, seqid); 178 __func__, args->csa_sequenceid);
179 return htonl(NFS4_OK); 179 /* Signal process_op to set this error on next op */
180 if (args->csa_cachethis == 0)
181 return htonl(NFS4ERR_RETRY_UNCACHED_REP);
182
183 /* The ca_maxresponsesize_cached is 0 with no DRC */
184 else if (args->csa_cachethis == 1)
185 return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
180 } 186 }
181 187
182 /* Wraparound */ 188 /* Wraparound */
183 if (seqid == 1 && (slot->seq_nr + 1) == 0) { 189 if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) {
184 slot->seq_nr = 1; 190 slot->seq_nr = 1;
185 return htonl(NFS4_OK); 191 return htonl(NFS4_OK);
186 } 192 }
@@ -225,27 +231,87 @@ validate_seqid(struct nfs4_slot_table *tbl, u32 slotid, u32 seqid)
225 return NULL; 231 return NULL;
226} 232}
227 233
228/* FIXME: referring calls should be processed */ 234/*
229unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, 235 * For each referring call triple, check the session's slot table for
236 * a match. If the slot is in use and the sequence numbers match, the
237 * client is still waiting for a response to the original request.
238 */
239static bool referring_call_exists(struct nfs_client *clp,
240 uint32_t nrclists,
241 struct referring_call_list *rclists)
242{
243 bool status = 0;
244 int i, j;
245 struct nfs4_session *session;
246 struct nfs4_slot_table *tbl;
247 struct referring_call_list *rclist;
248 struct referring_call *ref;
249
250 /*
251 * XXX When client trunking is implemented, this becomes
252 * a session lookup from within the loop
253 */
254 session = clp->cl_session;
255 tbl = &session->fc_slot_table;
256
257 for (i = 0; i < nrclists; i++) {
258 rclist = &rclists[i];
259 if (memcmp(session->sess_id.data,
260 rclist->rcl_sessionid.data,
261 NFS4_MAX_SESSIONID_LEN) != 0)
262 continue;
263
264 for (j = 0; j < rclist->rcl_nrefcalls; j++) {
265 ref = &rclist->rcl_refcalls[j];
266
267 dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u "
268 "slotid %u\n", __func__,
269 ((u32 *)&rclist->rcl_sessionid.data)[0],
270 ((u32 *)&rclist->rcl_sessionid.data)[1],
271 ((u32 *)&rclist->rcl_sessionid.data)[2],
272 ((u32 *)&rclist->rcl_sessionid.data)[3],
273 ref->rc_sequenceid, ref->rc_slotid);
274
275 spin_lock(&tbl->slot_tbl_lock);
276 status = (test_bit(ref->rc_slotid, tbl->used_slots) &&
277 tbl->slots[ref->rc_slotid].seq_nr ==
278 ref->rc_sequenceid);
279 spin_unlock(&tbl->slot_tbl_lock);
280 if (status)
281 goto out;
282 }
283 }
284
285out:
286 return status;
287}
288
289__be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
230 struct cb_sequenceres *res) 290 struct cb_sequenceres *res)
231{ 291{
232 struct nfs_client *clp; 292 struct nfs_client *clp;
233 int i, status; 293 int i;
234 294 __be32 status;
235 for (i = 0; i < args->csa_nrclists; i++)
236 kfree(args->csa_rclists[i].rcl_refcalls);
237 kfree(args->csa_rclists);
238 295
239 status = htonl(NFS4ERR_BADSESSION); 296 status = htonl(NFS4ERR_BADSESSION);
240 clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid); 297 clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid);
241 if (clp == NULL) 298 if (clp == NULL)
242 goto out; 299 goto out;
243 300
244 status = validate_seqid(&clp->cl_session->bc_slot_table, 301 status = validate_seqid(&clp->cl_session->bc_slot_table, args);
245 args->csa_slotid, args->csa_sequenceid);
246 if (status) 302 if (status)
247 goto out_putclient; 303 goto out_putclient;
248 304
305 /*
306 * Check for pending referring calls. If a match is found, a
307 * related callback was received before the response to the original
308 * call.
309 */
310 if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) {
311 status = htonl(NFS4ERR_DELAY);
312 goto out_putclient;
313 }
314
249 memcpy(&res->csr_sessionid, &args->csa_sessionid, 315 memcpy(&res->csr_sessionid, &args->csa_sessionid,
250 sizeof(res->csr_sessionid)); 316 sizeof(res->csr_sessionid));
251 res->csr_sequenceid = args->csa_sequenceid; 317 res->csr_sequenceid = args->csa_sequenceid;
@@ -256,15 +322,23 @@ unsigned nfs4_callback_sequence(struct cb_sequenceargs *args,
256out_putclient: 322out_putclient:
257 nfs_put_client(clp); 323 nfs_put_client(clp);
258out: 324out:
259 dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); 325 for (i = 0; i < args->csa_nrclists; i++)
260 res->csr_status = status; 326 kfree(args->csa_rclists[i].rcl_refcalls);
261 return res->csr_status; 327 kfree(args->csa_rclists);
328
329 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP))
330 res->csr_status = 0;
331 else
332 res->csr_status = status;
333 dprintk("%s: exit with status = %d res->csr_status %d\n", __func__,
334 ntohl(status), ntohl(res->csr_status));
335 return status;
262} 336}
263 337
264unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) 338__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy)
265{ 339{
266 struct nfs_client *clp; 340 struct nfs_client *clp;
267 int status; 341 __be32 status;
268 fmode_t flags = 0; 342 fmode_t flags = 0;
269 343
270 status = htonl(NFS4ERR_OP_NOT_IN_SESSION); 344 status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
@@ -289,4 +363,40 @@ out:
289 dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); 363 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
290 return status; 364 return status;
291} 365}
366
367/* Reduce the fore channel's max_slots to the target value */
368__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy)
369{
370 struct nfs_client *clp;
371 struct nfs4_slot_table *fc_tbl;
372 __be32 status;
373
374 status = htonl(NFS4ERR_OP_NOT_IN_SESSION);
375 clp = nfs_find_client(args->crsa_addr, 4);
376 if (clp == NULL)
377 goto out;
378
379 dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n",
380 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
381 args->crsa_target_max_slots);
382
383 fc_tbl = &clp->cl_session->fc_slot_table;
384
385 status = htonl(NFS4ERR_BAD_HIGH_SLOT);
386 if (args->crsa_target_max_slots > fc_tbl->max_slots ||
387 args->crsa_target_max_slots < 1)
388 goto out_putclient;
389
390 status = htonl(NFS4_OK);
391 if (args->crsa_target_max_slots == fc_tbl->max_slots)
392 goto out_putclient;
393
394 fc_tbl->target_max_slots = args->crsa_target_max_slots;
395 nfs41_handle_recall_slot(clp);
396out_putclient:
397 nfs_put_client(clp); /* balance nfs_find_client */
398out:
399 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
400 return status;
401}
292#endif /* CONFIG_NFS_V4_1 */ 402#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 8e1a2511c8be..05af212f0edf 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -9,6 +9,7 @@
9#include <linux/sunrpc/svc.h> 9#include <linux/sunrpc/svc.h>
10#include <linux/nfs4.h> 10#include <linux/nfs4.h>
11#include <linux/nfs_fs.h> 11#include <linux/nfs_fs.h>
12#include <linux/slab.h>
12#include "nfs4_fs.h" 13#include "nfs4_fs.h"
13#include "callback.h" 14#include "callback.h"
14 15
@@ -24,10 +25,14 @@
24#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ 25#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
25 4 + 1 + 3) 26 4 + 1 + 3)
26#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) 27#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
28#define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
27#endif /* CONFIG_NFS_V4_1 */ 29#endif /* CONFIG_NFS_V4_1 */
28 30
29#define NFSDBG_FACILITY NFSDBG_CALLBACK 31#define NFSDBG_FACILITY NFSDBG_CALLBACK
30 32
33/* Internal error code */
34#define NFS4ERR_RESOURCE_HDR 11050
35
31typedef __be32 (*callback_process_op_t)(void *, void *); 36typedef __be32 (*callback_process_op_t)(void *, void *);
32typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *); 37typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
33typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *); 38typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
@@ -173,7 +178,7 @@ static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
173 __be32 *p; 178 __be32 *p;
174 p = read_buf(xdr, 4); 179 p = read_buf(xdr, 4);
175 if (unlikely(p == NULL)) 180 if (unlikely(p == NULL))
176 return htonl(NFS4ERR_RESOURCE); 181 return htonl(NFS4ERR_RESOURCE_HDR);
177 *op = ntohl(*p); 182 *op = ntohl(*p);
178 return 0; 183 return 0;
179} 184}
@@ -215,10 +220,10 @@ out:
215 220
216#if defined(CONFIG_NFS_V4_1) 221#if defined(CONFIG_NFS_V4_1)
217 222
218static unsigned decode_sessionid(struct xdr_stream *xdr, 223static __be32 decode_sessionid(struct xdr_stream *xdr,
219 struct nfs4_sessionid *sid) 224 struct nfs4_sessionid *sid)
220{ 225{
221 uint32_t *p; 226 __be32 *p;
222 int len = NFS4_MAX_SESSIONID_LEN; 227 int len = NFS4_MAX_SESSIONID_LEN;
223 228
224 p = read_buf(xdr, len); 229 p = read_buf(xdr, len);
@@ -229,12 +234,12 @@ static unsigned decode_sessionid(struct xdr_stream *xdr,
229 return 0; 234 return 0;
230} 235}
231 236
232static unsigned decode_rc_list(struct xdr_stream *xdr, 237static __be32 decode_rc_list(struct xdr_stream *xdr,
233 struct referring_call_list *rc_list) 238 struct referring_call_list *rc_list)
234{ 239{
235 uint32_t *p; 240 __be32 *p;
236 int i; 241 int i;
237 unsigned status; 242 __be32 status;
238 243
239 status = decode_sessionid(xdr, &rc_list->rcl_sessionid); 244 status = decode_sessionid(xdr, &rc_list->rcl_sessionid);
240 if (status) 245 if (status)
@@ -267,13 +272,13 @@ out:
267 return status; 272 return status;
268} 273}
269 274
270static unsigned decode_cb_sequence_args(struct svc_rqst *rqstp, 275static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
271 struct xdr_stream *xdr, 276 struct xdr_stream *xdr,
272 struct cb_sequenceargs *args) 277 struct cb_sequenceargs *args)
273{ 278{
274 uint32_t *p; 279 __be32 *p;
275 int i; 280 int i;
276 unsigned status; 281 __be32 status;
277 282
278 status = decode_sessionid(xdr, &args->csa_sessionid); 283 status = decode_sessionid(xdr, &args->csa_sessionid);
279 if (status) 284 if (status)
@@ -327,11 +332,11 @@ out_free:
327 goto out; 332 goto out;
328} 333}
329 334
330static unsigned decode_recallany_args(struct svc_rqst *rqstp, 335static __be32 decode_recallany_args(struct svc_rqst *rqstp,
331 struct xdr_stream *xdr, 336 struct xdr_stream *xdr,
332 struct cb_recallanyargs *args) 337 struct cb_recallanyargs *args)
333{ 338{
334 uint32_t *p; 339 __be32 *p;
335 340
336 args->craa_addr = svc_addr(rqstp); 341 args->craa_addr = svc_addr(rqstp);
337 p = read_buf(xdr, 4); 342 p = read_buf(xdr, 4);
@@ -346,6 +351,20 @@ static unsigned decode_recallany_args(struct svc_rqst *rqstp,
346 return 0; 351 return 0;
347} 352}
348 353
354static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
355 struct xdr_stream *xdr,
356 struct cb_recallslotargs *args)
357{
358 __be32 *p;
359
360 args->crsa_addr = svc_addr(rqstp);
361 p = read_buf(xdr, 4);
362 if (unlikely(p == NULL))
363 return htonl(NFS4ERR_BADXDR);
364 args->crsa_target_max_slots = ntohl(*p++);
365 return 0;
366}
367
349#endif /* CONFIG_NFS_V4_1 */ 368#endif /* CONFIG_NFS_V4_1 */
350 369
351static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) 370static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
@@ -465,7 +484,7 @@ static __be32 encode_op_hdr(struct xdr_stream *xdr, uint32_t op, __be32 res)
465 484
466 p = xdr_reserve_space(xdr, 8); 485 p = xdr_reserve_space(xdr, 8);
467 if (unlikely(p == NULL)) 486 if (unlikely(p == NULL))
468 return htonl(NFS4ERR_RESOURCE); 487 return htonl(NFS4ERR_RESOURCE_HDR);
469 *p++ = htonl(op); 488 *p++ = htonl(op);
470 *p = res; 489 *p = res;
471 return 0; 490 return 0;
@@ -499,10 +518,10 @@ out:
499 518
500#if defined(CONFIG_NFS_V4_1) 519#if defined(CONFIG_NFS_V4_1)
501 520
502static unsigned encode_sessionid(struct xdr_stream *xdr, 521static __be32 encode_sessionid(struct xdr_stream *xdr,
503 const struct nfs4_sessionid *sid) 522 const struct nfs4_sessionid *sid)
504{ 523{
505 uint32_t *p; 524 __be32 *p;
506 int len = NFS4_MAX_SESSIONID_LEN; 525 int len = NFS4_MAX_SESSIONID_LEN;
507 526
508 p = xdr_reserve_space(xdr, len); 527 p = xdr_reserve_space(xdr, len);
@@ -513,11 +532,11 @@ static unsigned encode_sessionid(struct xdr_stream *xdr,
513 return 0; 532 return 0;
514} 533}
515 534
516static unsigned encode_cb_sequence_res(struct svc_rqst *rqstp, 535static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp,
517 struct xdr_stream *xdr, 536 struct xdr_stream *xdr,
518 const struct cb_sequenceres *res) 537 const struct cb_sequenceres *res)
519{ 538{
520 uint32_t *p; 539 __be32 *p;
521 unsigned status = res->csr_status; 540 unsigned status = res->csr_status;
522 541
523 if (unlikely(status != 0)) 542 if (unlikely(status != 0))
@@ -554,6 +573,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
554 case OP_CB_RECALL: 573 case OP_CB_RECALL:
555 case OP_CB_SEQUENCE: 574 case OP_CB_SEQUENCE:
556 case OP_CB_RECALL_ANY: 575 case OP_CB_RECALL_ANY:
576 case OP_CB_RECALL_SLOT:
557 *op = &callback_ops[op_nr]; 577 *op = &callback_ops[op_nr];
558 break; 578 break;
559 579
@@ -562,7 +582,6 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
562 case OP_CB_NOTIFY: 582 case OP_CB_NOTIFY:
563 case OP_CB_PUSH_DELEG: 583 case OP_CB_PUSH_DELEG:
564 case OP_CB_RECALLABLE_OBJ_AVAIL: 584 case OP_CB_RECALLABLE_OBJ_AVAIL:
565 case OP_CB_RECALL_SLOT:
566 case OP_CB_WANTS_CANCELLED: 585 case OP_CB_WANTS_CANCELLED:
567 case OP_CB_NOTIFY_LOCK: 586 case OP_CB_NOTIFY_LOCK:
568 return htonl(NFS4ERR_NOTSUPP); 587 return htonl(NFS4ERR_NOTSUPP);
@@ -602,20 +621,18 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
602static __be32 process_op(uint32_t minorversion, int nop, 621static __be32 process_op(uint32_t minorversion, int nop,
603 struct svc_rqst *rqstp, 622 struct svc_rqst *rqstp,
604 struct xdr_stream *xdr_in, void *argp, 623 struct xdr_stream *xdr_in, void *argp,
605 struct xdr_stream *xdr_out, void *resp) 624 struct xdr_stream *xdr_out, void *resp, int* drc_status)
606{ 625{
607 struct callback_op *op = &callback_ops[0]; 626 struct callback_op *op = &callback_ops[0];
608 unsigned int op_nr = OP_CB_ILLEGAL; 627 unsigned int op_nr;
609 __be32 status; 628 __be32 status;
610 long maxlen; 629 long maxlen;
611 __be32 res; 630 __be32 res;
612 631
613 dprintk("%s: start\n", __func__); 632 dprintk("%s: start\n", __func__);
614 status = decode_op_hdr(xdr_in, &op_nr); 633 status = decode_op_hdr(xdr_in, &op_nr);
615 if (unlikely(status)) { 634 if (unlikely(status))
616 status = htonl(NFS4ERR_OP_ILLEGAL); 635 return status;
617 goto out;
618 }
619 636
620 dprintk("%s: minorversion=%d nop=%d op_nr=%u\n", 637 dprintk("%s: minorversion=%d nop=%d op_nr=%u\n",
621 __func__, minorversion, nop, op_nr); 638 __func__, minorversion, nop, op_nr);
@@ -624,19 +641,32 @@ static __be32 process_op(uint32_t minorversion, int nop,
624 preprocess_nfs4_op(op_nr, &op); 641 preprocess_nfs4_op(op_nr, &op);
625 if (status == htonl(NFS4ERR_OP_ILLEGAL)) 642 if (status == htonl(NFS4ERR_OP_ILLEGAL))
626 op_nr = OP_CB_ILLEGAL; 643 op_nr = OP_CB_ILLEGAL;
627out: 644 if (status)
645 goto encode_hdr;
646
647 if (*drc_status) {
648 status = *drc_status;
649 goto encode_hdr;
650 }
651
628 maxlen = xdr_out->end - xdr_out->p; 652 maxlen = xdr_out->end - xdr_out->p;
629 if (maxlen > 0 && maxlen < PAGE_SIZE) { 653 if (maxlen > 0 && maxlen < PAGE_SIZE) {
630 if (likely(status == 0 && op->decode_args != NULL)) 654 status = op->decode_args(rqstp, xdr_in, argp);
631 status = op->decode_args(rqstp, xdr_in, argp); 655 if (likely(status == 0))
632 if (likely(status == 0 && op->process_op != NULL))
633 status = op->process_op(argp, resp); 656 status = op->process_op(argp, resp);
634 } else 657 } else
635 status = htonl(NFS4ERR_RESOURCE); 658 status = htonl(NFS4ERR_RESOURCE);
636 659
660 /* Only set by OP_CB_SEQUENCE processing */
661 if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) {
662 *drc_status = status;
663 status = 0;
664 }
665
666encode_hdr:
637 res = encode_op_hdr(xdr_out, op_nr, status); 667 res = encode_op_hdr(xdr_out, op_nr, status);
638 if (status == 0) 668 if (unlikely(res))
639 status = res; 669 return res;
640 if (op->encode_res != NULL && status == 0) 670 if (op->encode_res != NULL && status == 0)
641 status = op->encode_res(rqstp, xdr_out, resp); 671 status = op->encode_res(rqstp, xdr_out, resp);
642 dprintk("%s: done, status = %d\n", __func__, ntohl(status)); 672 dprintk("%s: done, status = %d\n", __func__, ntohl(status));
@@ -652,7 +682,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
652 struct cb_compound_hdr_res hdr_res = { NULL }; 682 struct cb_compound_hdr_res hdr_res = { NULL };
653 struct xdr_stream xdr_in, xdr_out; 683 struct xdr_stream xdr_in, xdr_out;
654 __be32 *p; 684 __be32 *p;
655 __be32 status; 685 __be32 status, drc_status = 0;
656 unsigned int nops = 0; 686 unsigned int nops = 0;
657 687
658 dprintk("%s: start\n", __func__); 688 dprintk("%s: start\n", __func__);
@@ -672,11 +702,18 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
672 return rpc_system_err; 702 return rpc_system_err;
673 703
674 while (status == 0 && nops != hdr_arg.nops) { 704 while (status == 0 && nops != hdr_arg.nops) {
675 status = process_op(hdr_arg.minorversion, nops, 705 status = process_op(hdr_arg.minorversion, nops, rqstp,
676 rqstp, &xdr_in, argp, &xdr_out, resp); 706 &xdr_in, argp, &xdr_out, resp, &drc_status);
677 nops++; 707 nops++;
678 } 708 }
679 709
710 /* Buffer overflow in decode_ops_hdr or encode_ops_hdr. Return
711 * resource error in cb_compound status without returning op */
712 if (unlikely(status == htonl(NFS4ERR_RESOURCE_HDR))) {
713 status = htonl(NFS4ERR_RESOURCE);
714 nops--;
715 }
716
680 *hdr_res.status = status; 717 *hdr_res.status = status;
681 *hdr_res.nops = htonl(nops); 718 *hdr_res.nops = htonl(nops);
682 dprintk("%s: done, status = %u\n", __func__, ntohl(status)); 719 dprintk("%s: done, status = %u\n", __func__, ntohl(status));
@@ -713,6 +750,11 @@ static struct callback_op callback_ops[] = {
713 .decode_args = (callback_decode_arg_t)decode_recallany_args, 750 .decode_args = (callback_decode_arg_t)decode_recallany_args,
714 .res_maxsize = CB_OP_RECALLANY_RES_MAXSZ, 751 .res_maxsize = CB_OP_RECALLANY_RES_MAXSZ,
715 }, 752 },
753 [OP_CB_RECALL_SLOT] = {
754 .process_op = (callback_process_op_t)nfs4_callback_recallslot,
755 .decode_args = (callback_decode_arg_t)decode_recallslot_args,
756 .res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ,
757 },
716#endif /* CONFIG_NFS_V4_1 */ 758#endif /* CONFIG_NFS_V4_1 */
717}; 759};
718 760
@@ -741,6 +783,7 @@ struct svc_version nfs4_callback_version1 = {
741 .vs_proc = nfs4_callback_procedures1, 783 .vs_proc = nfs4_callback_procedures1,
742 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, 784 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
743 .vs_dispatch = NULL, 785 .vs_dispatch = NULL,
786 .vs_hidden = 1,
744}; 787};
745 788
746struct svc_version nfs4_callback_version4 = { 789struct svc_version nfs4_callback_version4 = {
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index ee77713ce68b..a8766c4ef2e0 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -35,6 +35,7 @@
35#include <linux/vfs.h> 35#include <linux/vfs.h>
36#include <linux/inet.h> 36#include <linux/inet.h>
37#include <linux/in6.h> 37#include <linux/in6.h>
38#include <linux/slab.h>
38#include <net/ipv6.h> 39#include <net/ipv6.h>
39#include <linux/nfs_xdr.h> 40#include <linux/nfs_xdr.h>
40#include <linux/sunrpc/bc_xprt.h> 41#include <linux/sunrpc/bc_xprt.h>
@@ -164,30 +165,7 @@ error_0:
164 return ERR_PTR(err); 165 return ERR_PTR(err);
165} 166}
166 167
167static void nfs4_shutdown_client(struct nfs_client *clp)
168{
169#ifdef CONFIG_NFS_V4
170 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
171 nfs4_kill_renewd(clp);
172 BUG_ON(!RB_EMPTY_ROOT(&clp->cl_state_owners));
173 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
174 nfs_idmap_delete(clp);
175
176 rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
177#endif
178}
179
180/*
181 * Destroy the NFS4 callback service
182 */
183static void nfs4_destroy_callback(struct nfs_client *clp)
184{
185#ifdef CONFIG_NFS_V4 168#ifdef CONFIG_NFS_V4
186 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
187 nfs_callback_down(clp->cl_minorversion);
188#endif /* CONFIG_NFS_V4 */
189}
190
191/* 169/*
192 * Clears/puts all minor version specific parts from an nfs_client struct 170 * Clears/puts all minor version specific parts from an nfs_client struct
193 * reverting it to minorversion 0. 171 * reverting it to minorversion 0.
@@ -202,9 +180,33 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp)
202 180
203 clp->cl_call_sync = _nfs4_call_sync; 181 clp->cl_call_sync = _nfs4_call_sync;
204#endif /* CONFIG_NFS_V4_1 */ 182#endif /* CONFIG_NFS_V4_1 */
183}
184
185/*
186 * Destroy the NFS4 callback service
187 */
188static void nfs4_destroy_callback(struct nfs_client *clp)
189{
190 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
191 nfs_callback_down(clp->cl_minorversion);
192}
205 193
194static void nfs4_shutdown_client(struct nfs_client *clp)
195{
196 if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
197 nfs4_kill_renewd(clp);
198 nfs4_clear_client_minor_version(clp);
206 nfs4_destroy_callback(clp); 199 nfs4_destroy_callback(clp);
200 if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
201 nfs_idmap_delete(clp);
202
203 rpc_destroy_wait_queue(&clp->cl_rpcwaitq);
204}
205#else
206static void nfs4_shutdown_client(struct nfs_client *clp)
207{
207} 208}
209#endif /* CONFIG_NFS_V4 */
208 210
209/* 211/*
210 * Destroy a shared client record 212 * Destroy a shared client record
@@ -213,7 +215,6 @@ static void nfs_free_client(struct nfs_client *clp)
213{ 215{
214 dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version); 216 dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version);
215 217
216 nfs4_clear_client_minor_version(clp);
217 nfs4_shutdown_client(clp); 218 nfs4_shutdown_client(clp);
218 219
219 nfs_fscache_release_client_cookie(clp); 220 nfs_fscache_release_client_cookie(clp);
@@ -1293,7 +1294,8 @@ static int nfs4_init_server(struct nfs_server *server,
1293 1294
1294 /* Initialise the client representation from the mount data */ 1295 /* Initialise the client representation from the mount data */
1295 server->flags = data->flags; 1296 server->flags = data->flags;
1296 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; 1297 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|
1298 NFS_CAP_POSIX_LOCK;
1297 server->options = data->options; 1299 server->options = data->options;
1298 1300
1299 /* Get a client record */ 1301 /* Get a client record */
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 2563bebc4c67..15671245c6ee 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -10,6 +10,7 @@
10#include <linux/kthread.h> 10#include <linux/kthread.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/slab.h>
13#include <linux/smp_lock.h> 14#include <linux/smp_lock.h>
14#include <linux/spinlock.h> 15#include <linux/spinlock.h>
15 16
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 944b627ec6e1..69e7b8140122 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -71,4 +71,10 @@ static inline int nfs_inode_return_delegation(struct inode *inode)
71} 71}
72#endif 72#endif
73 73
74static inline int nfs_have_delegated_attributes(struct inode *inode)
75{
76 return nfs_have_delegation(inode, FMODE_READ) &&
77 !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED);
78}
79
74#endif 80#endif
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 2c5ace4f00a7..be46f26c9a56 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -560,7 +560,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
560 desc->entry = &my_entry; 560 desc->entry = &my_entry;
561 561
562 nfs_block_sillyrename(dentry); 562 nfs_block_sillyrename(dentry);
563 res = nfs_revalidate_mapping_nolock(inode, filp->f_mapping); 563 res = nfs_revalidate_mapping(inode, filp->f_mapping);
564 if (res < 0) 564 if (res < 0)
565 goto out; 565 goto out;
566 566
@@ -1025,12 +1025,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1025 res = NULL; 1025 res = NULL;
1026 goto out; 1026 goto out;
1027 /* This turned out not to be a regular file */ 1027 /* This turned out not to be a regular file */
1028 case -EISDIR:
1028 case -ENOTDIR: 1029 case -ENOTDIR:
1029 goto no_open; 1030 goto no_open;
1030 case -ELOOP: 1031 case -ELOOP:
1031 if (!(nd->intent.open.flags & O_NOFOLLOW)) 1032 if (!(nd->intent.open.flags & O_NOFOLLOW))
1032 goto no_open; 1033 goto no_open;
1033 /* case -EISDIR: */
1034 /* case -EINVAL: */ 1034 /* case -EINVAL: */
1035 default: 1035 default:
1036 goto out; 1036 goto out;
@@ -1615,6 +1615,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1615 goto out; 1615 goto out;
1616 1616
1617 new_dentry = dentry; 1617 new_dentry = dentry;
1618 rehash = NULL;
1618 new_inode = NULL; 1619 new_inode = NULL;
1619 } 1620 }
1620 } 1621 }
@@ -1788,7 +1789,7 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
1788 cache = nfs_access_search_rbtree(inode, cred); 1789 cache = nfs_access_search_rbtree(inode, cred);
1789 if (cache == NULL) 1790 if (cache == NULL)
1790 goto out; 1791 goto out;
1791 if (!nfs_have_delegation(inode, FMODE_READ) && 1792 if (!nfs_have_delegated_attributes(inode) &&
1792 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) 1793 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
1793 goto out_stale; 1794 goto out_stale;
1794 res->jiffies = cache->jiffies; 1795 res->jiffies = cache->jiffies;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e1d415e97849..ad4cd31d6050 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -44,6 +44,7 @@
44#include <linux/file.h> 44#include <linux/file.h>
45#include <linux/pagemap.h> 45#include <linux/pagemap.h>
46#include <linux/kref.h> 46#include <linux/kref.h>
47#include <linux/slab.h>
47 48
48#include <linux/nfs_fs.h> 49#include <linux/nfs_fs.h>
49#include <linux/nfs_page.h> 50#include <linux/nfs_page.h>
@@ -342,6 +343,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
342 data->res.fattr = &data->fattr; 343 data->res.fattr = &data->fattr;
343 data->res.eof = 0; 344 data->res.eof = 0;
344 data->res.count = bytes; 345 data->res.count = bytes;
346 nfs_fattr_init(&data->fattr);
345 msg.rpc_argp = &data->args; 347 msg.rpc_argp = &data->args;
346 msg.rpc_resp = &data->res; 348 msg.rpc_resp = &data->res;
347 349
@@ -575,6 +577,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
575 data->res.count = 0; 577 data->res.count = 0;
576 data->res.fattr = &data->fattr; 578 data->res.fattr = &data->fattr;
577 data->res.verf = &data->verf; 579 data->res.verf = &data->verf;
580 nfs_fattr_init(&data->fattr);
578 581
579 NFS_PROTO(data->inode)->commit_setup(data, &msg); 582 NFS_PROTO(data->inode)->commit_setup(data, &msg);
580 583
@@ -766,6 +769,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
766 data->res.fattr = &data->fattr; 769 data->res.fattr = &data->fattr;
767 data->res.count = bytes; 770 data->res.count = bytes;
768 data->res.verf = &data->verf; 771 data->res.verf = &data->verf;
772 nfs_fattr_init(&data->fattr);
769 773
770 task_setup_data.task = &data->task; 774 task_setup_data.task = &data->task;
771 task_setup_data.callback_data = data; 775 task_setup_data.callback_data = data;
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 95e1ca765d47..76fd235d0024 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -9,6 +9,7 @@
9#include <linux/hash.h> 9#include <linux/hash.h>
10#include <linux/string.h> 10#include <linux/string.h>
11#include <linux/kmod.h> 11#include <linux/kmod.h>
12#include <linux/slab.h>
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/socket.h> 14#include <linux/socket.h>
14#include <linux/seq_file.h> 15#include <linux/seq_file.h>
@@ -36,6 +37,19 @@ struct nfs_dns_ent {
36}; 37};
37 38
38 39
40static void nfs_dns_ent_update(struct cache_head *cnew,
41 struct cache_head *ckey)
42{
43 struct nfs_dns_ent *new;
44 struct nfs_dns_ent *key;
45
46 new = container_of(cnew, struct nfs_dns_ent, h);
47 key = container_of(ckey, struct nfs_dns_ent, h);
48
49 memcpy(&new->addr, &key->addr, key->addrlen);
50 new->addrlen = key->addrlen;
51}
52
39static void nfs_dns_ent_init(struct cache_head *cnew, 53static void nfs_dns_ent_init(struct cache_head *cnew,
40 struct cache_head *ckey) 54 struct cache_head *ckey)
41{ 55{
@@ -49,8 +63,7 @@ static void nfs_dns_ent_init(struct cache_head *cnew,
49 new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL); 63 new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL);
50 if (new->hostname) { 64 if (new->hostname) {
51 new->namelen = key->namelen; 65 new->namelen = key->namelen;
52 memcpy(&new->addr, &key->addr, key->addrlen); 66 nfs_dns_ent_update(cnew, ckey);
53 new->addrlen = key->addrlen;
54 } else { 67 } else {
55 new->namelen = 0; 68 new->namelen = 0;
56 new->addrlen = 0; 69 new->addrlen = 0;
@@ -234,7 +247,7 @@ static struct cache_detail nfs_dns_resolve = {
234 .cache_show = nfs_dns_show, 247 .cache_show = nfs_dns_show,
235 .match = nfs_dns_match, 248 .match = nfs_dns_match,
236 .init = nfs_dns_ent_init, 249 .init = nfs_dns_ent_init,
237 .update = nfs_dns_ent_init, 250 .update = nfs_dns_ent_update,
238 .alloc = nfs_dns_ent_alloc, 251 .alloc = nfs_dns_ent_alloc,
239}; 252};
240 253
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 6b891328f332..8d965bddb87e 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -24,9 +24,9 @@
24#include <linux/nfs_fs.h> 24#include <linux/nfs_fs.h>
25#include <linux/nfs_mount.h> 25#include <linux/nfs_mount.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/slab.h>
28#include <linux/pagemap.h> 27#include <linux/pagemap.h>
29#include <linux/aio.h> 28#include <linux/aio.h>
29#include <linux/gfp.h>
30 30
31#include <asm/uaccess.h> 31#include <asm/uaccess.h>
32#include <asm/system.h> 32#include <asm/system.h>
@@ -123,11 +123,11 @@ nfs_file_open(struct inode *inode, struct file *filp)
123 filp->f_path.dentry->d_parent->d_name.name, 123 filp->f_path.dentry->d_parent->d_name.name,
124 filp->f_path.dentry->d_name.name); 124 filp->f_path.dentry->d_name.name);
125 125
126 nfs_inc_stats(inode, NFSIOS_VFSOPEN);
126 res = nfs_check_flags(filp->f_flags); 127 res = nfs_check_flags(filp->f_flags);
127 if (res) 128 if (res)
128 return res; 129 return res;
129 130
130 nfs_inc_stats(inode, NFSIOS_VFSOPEN);
131 res = nfs_open(inode, filp); 131 res = nfs_open(inode, filp);
132 return res; 132 return res;
133} 133}
@@ -237,9 +237,9 @@ nfs_file_flush(struct file *file, fl_owner_t id)
237 dentry->d_parent->d_name.name, 237 dentry->d_parent->d_name.name,
238 dentry->d_name.name); 238 dentry->d_name.name);
239 239
240 nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
240 if ((file->f_mode & FMODE_WRITE) == 0) 241 if ((file->f_mode & FMODE_WRITE) == 0)
241 return 0; 242 return 0;
242 nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
243 243
244 /* Flush writes to the server and return any errors */ 244 /* Flush writes to the server and return any errors */
245 return nfs_do_fsync(ctx, inode); 245 return nfs_do_fsync(ctx, inode);
@@ -262,9 +262,11 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
262 (unsigned long) count, (unsigned long) pos); 262 (unsigned long) count, (unsigned long) pos);
263 263
264 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); 264 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
265 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, count); 265 if (!result) {
266 if (!result)
267 result = generic_file_aio_read(iocb, iov, nr_segs, pos); 266 result = generic_file_aio_read(iocb, iov, nr_segs, pos);
267 if (result > 0)
268 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
269 }
268 return result; 270 return result;
269} 271}
270 272
@@ -282,8 +284,11 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
282 (unsigned long) count, (unsigned long long) *ppos); 284 (unsigned long) count, (unsigned long long) *ppos);
283 285
284 res = nfs_revalidate_mapping(inode, filp->f_mapping); 286 res = nfs_revalidate_mapping(inode, filp->f_mapping);
285 if (!res) 287 if (!res) {
286 res = generic_file_splice_read(filp, ppos, pipe, count, flags); 288 res = generic_file_splice_read(filp, ppos, pipe, count, flags);
289 if (res > 0)
290 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res);
291 }
287 return res; 292 return res;
288} 293}
289 294
@@ -486,6 +491,9 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
486{ 491{
487 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 492 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
488 493
494 /* Only do I/O if gfp is a superset of GFP_KERNEL */
495 if ((gfp & GFP_KERNEL) == GFP_KERNEL)
496 nfs_wb_page(page->mapping->host, page);
489 /* If PagePrivate() is set, then the page is not freeable */ 497 /* If PagePrivate() is set, then the page is not freeable */
490 if (PagePrivate(page)) 498 if (PagePrivate(page))
491 return 0; 499 return 0;
@@ -594,6 +602,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
594{ 602{
595 struct dentry * dentry = iocb->ki_filp->f_path.dentry; 603 struct dentry * dentry = iocb->ki_filp->f_path.dentry;
596 struct inode * inode = dentry->d_inode; 604 struct inode * inode = dentry->d_inode;
605 unsigned long written = 0;
597 ssize_t result; 606 ssize_t result;
598 size_t count = iov_length(iov, nr_segs); 607 size_t count = iov_length(iov, nr_segs);
599 608
@@ -620,14 +629,18 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
620 if (!count) 629 if (!count)
621 goto out; 630 goto out;
622 631
623 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
624 result = generic_file_aio_write(iocb, iov, nr_segs, pos); 632 result = generic_file_aio_write(iocb, iov, nr_segs, pos);
633 if (result > 0)
634 written = result;
635
625 /* Return error values for O_DSYNC and IS_SYNC() */ 636 /* Return error values for O_DSYNC and IS_SYNC() */
626 if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { 637 if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
627 int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode); 638 int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode);
628 if (err < 0) 639 if (err < 0)
629 result = err; 640 result = err;
630 } 641 }
642 if (result > 0)
643 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
631out: 644out:
632 return result; 645 return result;
633 646
@@ -642,6 +655,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
642{ 655{
643 struct dentry *dentry = filp->f_path.dentry; 656 struct dentry *dentry = filp->f_path.dentry;
644 struct inode *inode = dentry->d_inode; 657 struct inode *inode = dentry->d_inode;
658 unsigned long written = 0;
645 ssize_t ret; 659 ssize_t ret;
646 660
647 dprintk("NFS splice_write(%s/%s, %lu@%llu)\n", 661 dprintk("NFS splice_write(%s/%s, %lu@%llu)\n",
@@ -652,14 +666,17 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
652 * The combination of splice and an O_APPEND destination is disallowed. 666 * The combination of splice and an O_APPEND destination is disallowed.
653 */ 667 */
654 668
655 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
656
657 ret = generic_file_splice_write(pipe, filp, ppos, count, flags); 669 ret = generic_file_splice_write(pipe, filp, ppos, count, flags);
670 if (ret > 0)
671 written = ret;
672
658 if (ret >= 0 && nfs_need_sync_write(filp, inode)) { 673 if (ret >= 0 && nfs_need_sync_write(filp, inode)) {
659 int err = nfs_do_fsync(nfs_file_open_context(filp), inode); 674 int err = nfs_do_fsync(nfs_file_open_context(filp), inode);
660 if (err < 0) 675 if (err < 0)
661 ret = err; 676 ret = err;
662 } 677 }
678 if (ret > 0)
679 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
663 return ret; 680 return ret;
664} 681}
665 682
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index fa588006588d..a6b16ed93229 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -17,6 +17,7 @@
17#include <linux/nfs_fs_sb.h> 17#include <linux/nfs_fs_sb.h>
18#include <linux/in6.h> 18#include <linux/in6.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/slab.h>
20 21
21#include "internal.h" 22#include "internal.h"
22#include "iostat.h" 23#include "iostat.h"
@@ -354,12 +355,11 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode)
354 */ 355 */
355int nfs_fscache_release_page(struct page *page, gfp_t gfp) 356int nfs_fscache_release_page(struct page *page, gfp_t gfp)
356{ 357{
357 struct nfs_inode *nfsi = NFS_I(page->mapping->host);
358 struct fscache_cookie *cookie = nfsi->fscache;
359
360 BUG_ON(!cookie);
361
362 if (PageFsCache(page)) { 358 if (PageFsCache(page)) {
359 struct nfs_inode *nfsi = NFS_I(page->mapping->host);
360 struct fscache_cookie *cookie = nfsi->fscache;
361
362 BUG_ON(!cookie);
363 dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", 363 dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
364 cookie, page, nfsi); 364 cookie, page, nfsi);
365 365
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index faa091865ad0..50a56edca0b5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -36,6 +36,7 @@
36#include <linux/vfs.h> 36#include <linux/vfs.h>
37#include <linux/inet.h> 37#include <linux/inet.h>
38#include <linux/nfs_xdr.h> 38#include <linux/nfs_xdr.h>
39#include <linux/slab.h>
39 40
40#include <asm/system.h> 41#include <asm/system.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
@@ -97,22 +98,6 @@ u64 nfs_compat_user_ino64(u64 fileid)
97 return ino; 98 return ino;
98} 99}
99 100
100int nfs_write_inode(struct inode *inode, int sync)
101{
102 int ret;
103
104 if (sync) {
105 ret = filemap_fdatawait(inode->i_mapping);
106 if (ret == 0)
107 ret = nfs_commit_inode(inode, FLUSH_SYNC);
108 } else
109 ret = nfs_commit_inode(inode, 0);
110 if (ret >= 0)
111 return 0;
112 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
113 return ret;
114}
115
116void nfs_clear_inode(struct inode *inode) 101void nfs_clear_inode(struct inode *inode)
117{ 102{
118 /* 103 /*
@@ -130,16 +115,12 @@ void nfs_clear_inode(struct inode *inode)
130 */ 115 */
131int nfs_sync_mapping(struct address_space *mapping) 116int nfs_sync_mapping(struct address_space *mapping)
132{ 117{
133 int ret; 118 int ret = 0;
134 119
135 if (mapping->nrpages == 0) 120 if (mapping->nrpages != 0) {
136 return 0; 121 unmap_mapping_range(mapping, 0, 0, 0);
137 unmap_mapping_range(mapping, 0, 0, 0); 122 ret = nfs_wb_all(mapping->host);
138 ret = filemap_write_and_wait(mapping); 123 }
139 if (ret != 0)
140 goto out;
141 ret = nfs_wb_all(mapping->host);
142out:
143 return ret; 124 return ret;
144} 125}
145 126
@@ -511,17 +492,11 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
511 int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; 492 int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
512 int err; 493 int err;
513 494
514 /* 495 /* Flush out writes to the server in order to update c/mtime. */
515 * Flush out writes to the server in order to update c/mtime.
516 *
517 * Hold the i_mutex to suspend application writes temporarily;
518 * this prevents long-running writing applications from blocking
519 * nfs_wb_nocommit.
520 */
521 if (S_ISREG(inode->i_mode)) { 496 if (S_ISREG(inode->i_mode)) {
522 mutex_lock(&inode->i_mutex); 497 err = filemap_write_and_wait(inode->i_mapping);
523 nfs_wb_nocommit(inode); 498 if (err)
524 mutex_unlock(&inode->i_mutex); 499 goto out;
525 } 500 }
526 501
527 /* 502 /*
@@ -545,6 +520,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
545 generic_fillattr(inode, stat); 520 generic_fillattr(inode, stat);
546 stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); 521 stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
547 } 522 }
523out:
548 return err; 524 return err;
549} 525}
550 526
@@ -574,14 +550,14 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
574 nfs_revalidate_inode(server, inode); 550 nfs_revalidate_inode(server, inode);
575} 551}
576 552
577static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred) 553static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred)
578{ 554{
579 struct nfs_open_context *ctx; 555 struct nfs_open_context *ctx;
580 556
581 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 557 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
582 if (ctx != NULL) { 558 if (ctx != NULL) {
583 ctx->path.dentry = dget(dentry); 559 ctx->path = *path;
584 ctx->path.mnt = mntget(mnt); 560 path_get(&ctx->path);
585 ctx->cred = get_rpccred(cred); 561 ctx->cred = get_rpccred(cred);
586 ctx->state = NULL; 562 ctx->state = NULL;
587 ctx->lockowner = current->files; 563 ctx->lockowner = current->files;
@@ -620,11 +596,6 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
620 __put_nfs_open_context(ctx, 0); 596 __put_nfs_open_context(ctx, 0);
621} 597}
622 598
623static void put_nfs_open_context_sync(struct nfs_open_context *ctx)
624{
625 __put_nfs_open_context(ctx, 1);
626}
627
628/* 599/*
629 * Ensure that mmap has a recent RPC credential for use when writing out 600 * Ensure that mmap has a recent RPC credential for use when writing out
630 * shared pages 601 * shared pages
@@ -652,10 +623,10 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
652 list_for_each_entry(pos, &nfsi->open_files, list) { 623 list_for_each_entry(pos, &nfsi->open_files, list) {
653 if (cred != NULL && pos->cred != cred) 624 if (cred != NULL && pos->cred != cred)
654 continue; 625 continue;
655 if ((pos->mode & mode) == mode) { 626 if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode)
656 ctx = get_nfs_open_context(pos); 627 continue;
657 break; 628 ctx = get_nfs_open_context(pos);
658 } 629 break;
659 } 630 }
660 spin_unlock(&inode->i_lock); 631 spin_unlock(&inode->i_lock);
661 return ctx; 632 return ctx;
@@ -671,7 +642,7 @@ static void nfs_file_clear_open_context(struct file *filp)
671 spin_lock(&inode->i_lock); 642 spin_lock(&inode->i_lock);
672 list_move_tail(&ctx->list, &NFS_I(inode)->open_files); 643 list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
673 spin_unlock(&inode->i_lock); 644 spin_unlock(&inode->i_lock);
674 put_nfs_open_context_sync(ctx); 645 __put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 0 : 1);
675 } 646 }
676} 647}
677 648
@@ -686,7 +657,7 @@ int nfs_open(struct inode *inode, struct file *filp)
686 cred = rpc_lookup_cred(); 657 cred = rpc_lookup_cred();
687 if (IS_ERR(cred)) 658 if (IS_ERR(cred))
688 return PTR_ERR(cred); 659 return PTR_ERR(cred);
689 ctx = alloc_nfs_open_context(filp->f_path.mnt, filp->f_path.dentry, cred); 660 ctx = alloc_nfs_open_context(&filp->f_path, cred);
690 put_rpccred(cred); 661 put_rpccred(cred);
691 if (ctx == NULL) 662 if (ctx == NULL)
692 return -ENOMEM; 663 return -ENOMEM;
@@ -759,7 +730,7 @@ int nfs_attribute_timeout(struct inode *inode)
759{ 730{
760 struct nfs_inode *nfsi = NFS_I(inode); 731 struct nfs_inode *nfsi = NFS_I(inode);
761 732
762 if (nfs_have_delegation(inode, FMODE_READ)) 733 if (nfs_have_delegated_attributes(inode))
763 return 0; 734 return 0;
764 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); 735 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
765} 736}
@@ -779,7 +750,7 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
779 return __nfs_revalidate_inode(server, inode); 750 return __nfs_revalidate_inode(server, inode);
780} 751}
781 752
782static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_space *mapping) 753static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
783{ 754{
784 struct nfs_inode *nfsi = NFS_I(inode); 755 struct nfs_inode *nfsi = NFS_I(inode);
785 756
@@ -800,49 +771,10 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa
800 return 0; 771 return 0;
801} 772}
802 773
803static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
804{
805 int ret = 0;
806
807 mutex_lock(&inode->i_mutex);
808 if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_DATA) {
809 ret = nfs_sync_mapping(mapping);
810 if (ret == 0)
811 ret = nfs_invalidate_mapping_nolock(inode, mapping);
812 }
813 mutex_unlock(&inode->i_mutex);
814 return ret;
815}
816
817/**
818 * nfs_revalidate_mapping_nolock - Revalidate the pagecache
819 * @inode - pointer to host inode
820 * @mapping - pointer to mapping
821 */
822int nfs_revalidate_mapping_nolock(struct inode *inode, struct address_space *mapping)
823{
824 struct nfs_inode *nfsi = NFS_I(inode);
825 int ret = 0;
826
827 if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
828 || nfs_attribute_timeout(inode) || NFS_STALE(inode)) {
829 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
830 if (ret < 0)
831 goto out;
832 }
833 if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
834 ret = nfs_invalidate_mapping_nolock(inode, mapping);
835out:
836 return ret;
837}
838
839/** 774/**
840 * nfs_revalidate_mapping - Revalidate the pagecache 775 * nfs_revalidate_mapping - Revalidate the pagecache
841 * @inode - pointer to host inode 776 * @inode - pointer to host inode
842 * @mapping - pointer to mapping 777 * @mapping - pointer to mapping
843 *
844 * This version of the function will take the inode->i_mutex and attempt to
845 * flush out all dirty data if it needs to invalidate the page cache.
846 */ 778 */
847int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) 779int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
848{ 780{
@@ -1261,8 +1193,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1261 1193
1262 if (fattr->valid & NFS_ATTR_FATTR_MODE) { 1194 if (fattr->valid & NFS_ATTR_FATTR_MODE) {
1263 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { 1195 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
1196 umode_t newmode = inode->i_mode & S_IFMT;
1197 newmode |= fattr->mode & S_IALLUGO;
1198 inode->i_mode = newmode;
1264 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1199 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1265 inode->i_mode = fattr->mode;
1266 } 1200 }
1267 } else if (server->caps & NFS_CAP_MODE) 1201 } else if (server->caps & NFS_CAP_MODE)
1268 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR 1202 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
@@ -1418,6 +1352,7 @@ static void init_once(void *foo)
1418 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); 1352 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
1419 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); 1353 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
1420 nfsi->npages = 0; 1354 nfsi->npages = 0;
1355 nfsi->ncommit = 0;
1421 atomic_set(&nfsi->silly_count, 1); 1356 atomic_set(&nfsi->silly_count, 1);
1422 INIT_HLIST_HEAD(&nfsi->silly_list); 1357 INIT_HLIST_HEAD(&nfsi->silly_list);
1423 init_waitqueue_head(&nfsi->waitqueue); 1358 init_waitqueue_head(&nfsi->waitqueue);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 29e464d23b32..11f82f03c5de 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -211,7 +211,7 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
211extern struct workqueue_struct *nfsiod_workqueue; 211extern struct workqueue_struct *nfsiod_workqueue;
212extern struct inode *nfs_alloc_inode(struct super_block *sb); 212extern struct inode *nfs_alloc_inode(struct super_block *sb);
213extern void nfs_destroy_inode(struct inode *); 213extern void nfs_destroy_inode(struct inode *);
214extern int nfs_write_inode(struct inode *,int); 214extern int nfs_write_inode(struct inode *, struct writeback_control *);
215extern void nfs_clear_inode(struct inode *); 215extern void nfs_clear_inode(struct inode *);
216#ifdef CONFIG_NFS_V4 216#ifdef CONFIG_NFS_V4
217extern void nfs4_clear_inode(struct inode *); 217extern void nfs4_clear_inode(struct inode *);
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 46d779abafd3..1d8d5c813b01 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -57,12 +57,12 @@ static inline void nfs_add_fscache_stats(struct inode *inode,
57} 57}
58#endif 58#endif
59 59
60static inline struct nfs_iostats *nfs_alloc_iostats(void) 60static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void)
61{ 61{
62 return alloc_percpu(struct nfs_iostats); 62 return alloc_percpu(struct nfs_iostats);
63} 63}
64 64
65static inline void nfs_free_iostats(struct nfs_iostats *stats) 65static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats)
66{ 66{
67 if (stats != NULL) 67 if (stats != NULL)
68 free_percpu(stats); 68 free_percpu(stats);
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 0adefc40cc89..59047f8d7d72 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -120,7 +120,7 @@ static struct {
120 { .status = MNT3ERR_INVAL, .errno = -EINVAL, }, 120 { .status = MNT3ERR_INVAL, .errno = -EINVAL, },
121 { .status = MNT3ERR_NAMETOOLONG, .errno = -ENAMETOOLONG, }, 121 { .status = MNT3ERR_NAMETOOLONG, .errno = -ENAMETOOLONG, },
122 { .status = MNT3ERR_NOTSUPP, .errno = -ENOTSUPP, }, 122 { .status = MNT3ERR_NOTSUPP, .errno = -ENOTSUPP, },
123 { .status = MNT3ERR_SERVERFAULT, .errno = -ESERVERFAULT, }, 123 { .status = MNT3ERR_SERVERFAULT, .errno = -EREMOTEIO, },
124}; 124};
125 125
126struct mountres { 126struct mountres {
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 40c766782891..7888cf36022d 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -8,6 +8,7 @@
8 */ 8 */
9 9
10#include <linux/dcache.h> 10#include <linux/dcache.h>
11#include <linux/gfp.h>
11#include <linux/mount.h> 12#include <linux/mount.h>
12#include <linux/namei.h> 13#include <linux/namei.h>
13#include <linux/nfs_fs.h> 14#include <linux/nfs_fs.h>
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 5e078b222b4e..81cf14257916 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -12,7 +12,6 @@
12#include <linux/param.h> 12#include <linux/param.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/slab.h>
16#include <linux/errno.h> 15#include <linux/errno.h>
17#include <linux/string.h> 16#include <linux/string.h>
18#include <linux/in.h> 17#include <linux/in.h>
@@ -699,7 +698,7 @@ static struct {
699 { NFSERR_BAD_COOKIE, -EBADCOOKIE }, 698 { NFSERR_BAD_COOKIE, -EBADCOOKIE },
700 { NFSERR_NOTSUPP, -ENOTSUPP }, 699 { NFSERR_NOTSUPP, -ENOTSUPP },
701 { NFSERR_TOOSMALL, -ETOOSMALL }, 700 { NFSERR_TOOSMALL, -ETOOSMALL },
702 { NFSERR_SERVERFAULT, -ESERVERFAULT }, 701 { NFSERR_SERVERFAULT, -EREMOTEIO },
703 { NFSERR_BADTYPE, -EBADTYPE }, 702 { NFSERR_BADTYPE, -EBADTYPE },
704 { NFSERR_JUKEBOX, -EJUKEBOX }, 703 { NFSERR_JUKEBOX, -EJUKEBOX },
705 { -1, -EIO } 704 { -1, -EIO }
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index bac60515a4b3..d150ae0c5ecd 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -1,4 +1,5 @@
1#include <linux/fs.h> 1#include <linux/fs.h>
2#include <linux/gfp.h>
2#include <linux/nfs.h> 3#include <linux/nfs.h>
3#include <linux/nfs3.h> 4#include <linux/nfs3.h>
4#include <linux/nfs_fs.h> 5#include <linux/nfs_fs.h>
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 3f8881d1a050..e701002694e5 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -10,6 +10,7 @@
10#include <linux/errno.h> 10#include <linux/errno.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/sunrpc/clnt.h> 12#include <linux/sunrpc/clnt.h>
13#include <linux/slab.h>
13#include <linux/nfs.h> 14#include <linux/nfs.h>
14#include <linux/nfs3.h> 15#include <linux/nfs3.h>
15#include <linux/nfs_fs.h> 16#include <linux/nfs_fs.h>
@@ -22,14 +23,14 @@
22 23
23#define NFSDBG_FACILITY NFSDBG_PROC 24#define NFSDBG_FACILITY NFSDBG_PROC
24 25
25/* A wrapper to handle the EJUKEBOX error message */ 26/* A wrapper to handle the EJUKEBOX and EKEYEXPIRED error messages */
26static int 27static int
27nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) 28nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
28{ 29{
29 int res; 30 int res;
30 do { 31 do {
31 res = rpc_call_sync(clnt, msg, flags); 32 res = rpc_call_sync(clnt, msg, flags);
32 if (res != -EJUKEBOX) 33 if (res != -EJUKEBOX && res != -EKEYEXPIRED)
33 break; 34 break;
34 schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); 35 schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
35 res = -ERESTARTSYS; 36 res = -ERESTARTSYS;
@@ -42,9 +43,10 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
42static int 43static int
43nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode) 44nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode)
44{ 45{
45 if (task->tk_status != -EJUKEBOX) 46 if (task->tk_status != -EJUKEBOX && task->tk_status != -EKEYEXPIRED)
46 return 0; 47 return 0;
47 nfs_inc_stats(inode, NFSIOS_DELAY); 48 if (task->tk_status == -EJUKEBOX)
49 nfs_inc_stats(inode, NFSIOS_DELAY);
48 task->tk_status = 0; 50 task->tk_status = 0;
49 rpc_restart_call(task); 51 rpc_restart_call(task);
50 rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); 52 rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 5fe5492fbd29..56a86f6ac8b5 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -9,7 +9,6 @@
9#include <linux/param.h> 9#include <linux/param.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/slab.h>
13#include <linux/errno.h> 12#include <linux/errno.h>
14#include <linux/string.h> 13#include <linux/string.h>
15#include <linux/in.h> 14#include <linux/in.h>
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 865265bdca03..a187200a7aac 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -46,6 +46,7 @@ enum nfs4_client_state {
46 NFS4CLNT_DELEGRETURN, 46 NFS4CLNT_DELEGRETURN,
47 NFS4CLNT_SESSION_RESET, 47 NFS4CLNT_SESSION_RESET,
48 NFS4CLNT_SESSION_DRAINING, 48 NFS4CLNT_SESSION_DRAINING,
49 NFS4CLNT_RECALL_SLOT,
49}; 50};
50 51
51/* 52/*
@@ -146,6 +147,7 @@ enum {
146 NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */ 147 NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */
147 NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */ 148 NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */
148 NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */ 149 NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */
150 NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */
149}; 151};
150 152
151struct nfs4_state { 153struct nfs4_state {
@@ -277,7 +279,9 @@ extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
277extern void nfs4_schedule_state_recovery(struct nfs_client *); 279extern void nfs4_schedule_state_recovery(struct nfs_client *);
278extern void nfs4_schedule_state_manager(struct nfs_client *); 280extern void nfs4_schedule_state_manager(struct nfs_client *);
279extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state); 281extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state);
282extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state);
280extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); 283extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
284extern void nfs41_handle_recall_slot(struct nfs_client *clp);
281extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 285extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
282extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); 286extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
283extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); 287extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index fa3408f20112..f071d12c613b 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -11,6 +11,7 @@
11#include <linux/mount.h> 11#include <linux/mount.h>
12#include <linux/namei.h> 12#include <linux/namei.h>
13#include <linux/nfs_fs.h> 13#include <linux/nfs_fs.h>
14#include <linux/slab.h>
14#include <linux/string.h> 15#include <linux/string.h>
15#include <linux/sunrpc/clnt.h> 16#include <linux/sunrpc/clnt.h>
16#include <linux/vfs.h> 17#include <linux/vfs.h>
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 198d51d17c13..638067007c65 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -39,6 +39,7 @@
39#include <linux/delay.h> 39#include <linux/delay.h>
40#include <linux/errno.h> 40#include <linux/errno.h>
41#include <linux/string.h> 41#include <linux/string.h>
42#include <linux/slab.h>
42#include <linux/sunrpc/clnt.h> 43#include <linux/sunrpc/clnt.h>
43#include <linux/nfs.h> 44#include <linux/nfs.h>
44#include <linux/nfs4.h> 45#include <linux/nfs4.h>
@@ -249,19 +250,15 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
249 if (state == NULL) 250 if (state == NULL)
250 break; 251 break;
251 nfs4_state_mark_reclaim_nograce(clp, state); 252 nfs4_state_mark_reclaim_nograce(clp, state);
252 case -NFS4ERR_STALE_CLIENTID: 253 goto do_state_recovery;
253 case -NFS4ERR_STALE_STATEID: 254 case -NFS4ERR_STALE_STATEID:
254 case -NFS4ERR_EXPIRED: 255 if (state == NULL)
255 nfs4_schedule_state_recovery(clp);
256 ret = nfs4_wait_clnt_recover(clp);
257 if (ret == 0)
258 exception->retry = 1;
259#if !defined(CONFIG_NFS_V4_1)
260 break;
261#else /* !defined(CONFIG_NFS_V4_1) */
262 if (!nfs4_has_session(server->nfs_client))
263 break; 256 break;
264 /* FALLTHROUGH */ 257 nfs4_state_mark_reclaim_reboot(clp, state);
258 case -NFS4ERR_STALE_CLIENTID:
259 case -NFS4ERR_EXPIRED:
260 goto do_state_recovery;
261#if defined(CONFIG_NFS_V4_1)
265 case -NFS4ERR_BADSESSION: 262 case -NFS4ERR_BADSESSION:
266 case -NFS4ERR_BADSLOT: 263 case -NFS4ERR_BADSLOT:
267 case -NFS4ERR_BAD_HIGH_SLOT: 264 case -NFS4ERR_BAD_HIGH_SLOT:
@@ -274,7 +271,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
274 nfs4_schedule_state_recovery(clp); 271 nfs4_schedule_state_recovery(clp);
275 exception->retry = 1; 272 exception->retry = 1;
276 break; 273 break;
277#endif /* !defined(CONFIG_NFS_V4_1) */ 274#endif /* defined(CONFIG_NFS_V4_1) */
278 case -NFS4ERR_FILE_OPEN: 275 case -NFS4ERR_FILE_OPEN:
279 if (exception->timeout > HZ) { 276 if (exception->timeout > HZ) {
280 /* We have retried a decent amount, time to 277 /* We have retried a decent amount, time to
@@ -285,6 +282,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
285 } 282 }
286 case -NFS4ERR_GRACE: 283 case -NFS4ERR_GRACE:
287 case -NFS4ERR_DELAY: 284 case -NFS4ERR_DELAY:
285 case -EKEYEXPIRED:
288 ret = nfs4_delay(server->client, &exception->timeout); 286 ret = nfs4_delay(server->client, &exception->timeout);
289 if (ret != 0) 287 if (ret != 0)
290 break; 288 break;
@@ -293,6 +291,12 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
293 } 291 }
294 /* We failed to handle the error */ 292 /* We failed to handle the error */
295 return nfs4_map_errors(ret); 293 return nfs4_map_errors(ret);
294do_state_recovery:
295 nfs4_schedule_state_recovery(clp);
296 ret = nfs4_wait_clnt_recover(clp);
297 if (ret == 0)
298 exception->retry = 1;
299 return ret;
296} 300}
297 301
298 302
@@ -416,7 +420,8 @@ static void nfs41_sequence_done(struct nfs_client *clp,
416 clp->cl_last_renewal = timestamp; 420 clp->cl_last_renewal = timestamp;
417 spin_unlock(&clp->cl_lock); 421 spin_unlock(&clp->cl_lock);
418 /* Check sequence flags */ 422 /* Check sequence flags */
419 nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); 423 if (atomic_read(&clp->cl_count) > 1)
424 nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
420 } 425 }
421out: 426out:
422 /* The session may be reset by one of the error handlers. */ 427 /* The session may be reset by one of the error handlers. */
@@ -722,8 +727,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
722 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); 727 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid);
723 if (p->o_arg.seqid == NULL) 728 if (p->o_arg.seqid == NULL)
724 goto err_free; 729 goto err_free;
725 p->path.mnt = mntget(path->mnt); 730 path_get(path);
726 p->path.dentry = dget(path->dentry); 731 p->path = *path;
727 p->dir = parent; 732 p->dir = parent;
728 p->owner = sp; 733 p->owner = sp;
729 atomic_inc(&sp->so_count); 734 atomic_inc(&sp->so_count);
@@ -1161,7 +1166,7 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
1161 int err; 1166 int err;
1162 do { 1167 do {
1163 err = _nfs4_do_open_reclaim(ctx, state); 1168 err = _nfs4_do_open_reclaim(ctx, state);
1164 if (err != -NFS4ERR_DELAY) 1169 if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED)
1165 break; 1170 break;
1166 nfs4_handle_exception(server, err, &exception); 1171 nfs4_handle_exception(server, err, &exception);
1167 } while (exception.retry); 1172 } while (exception.retry);
@@ -1518,6 +1523,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
1518 nfs_post_op_update_inode(dir, o_res->dir_attr); 1523 nfs_post_op_update_inode(dir, o_res->dir_attr);
1519 } else 1524 } else
1520 nfs_refresh_inode(dir, o_res->dir_attr); 1525 nfs_refresh_inode(dir, o_res->dir_attr);
1526 if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
1527 server->caps &= ~NFS_CAP_POSIX_LOCK;
1521 if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { 1528 if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
1522 status = _nfs4_proc_open_confirm(data); 1529 status = _nfs4_proc_open_confirm(data);
1523 if (status != 0) 1530 if (status != 0)
@@ -1580,6 +1587,7 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state
1580 goto out; 1587 goto out;
1581 case -NFS4ERR_GRACE: 1588 case -NFS4ERR_GRACE:
1582 case -NFS4ERR_DELAY: 1589 case -NFS4ERR_DELAY:
1590 case -EKEYEXPIRED:
1583 nfs4_handle_exception(server, err, &exception); 1591 nfs4_handle_exception(server, err, &exception);
1584 err = 0; 1592 err = 0;
1585 } 1593 }
@@ -1658,6 +1666,8 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
1658 status = PTR_ERR(state); 1666 status = PTR_ERR(state);
1659 if (IS_ERR(state)) 1667 if (IS_ERR(state))
1660 goto err_opendata_put; 1668 goto err_opendata_put;
1669 if (server->caps & NFS_CAP_POSIX_LOCK)
1670 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
1661 nfs4_opendata_put(opendata); 1671 nfs4_opendata_put(opendata);
1662 nfs4_put_state_owner(sp); 1672 nfs4_put_state_owner(sp);
1663 *res = state; 1673 *res = state;
@@ -1940,8 +1950,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1940 calldata->res.seqid = calldata->arg.seqid; 1950 calldata->res.seqid = calldata->arg.seqid;
1941 calldata->res.server = server; 1951 calldata->res.server = server;
1942 calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; 1952 calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
1943 calldata->path.mnt = mntget(path->mnt); 1953 path_get(path);
1944 calldata->path.dentry = dget(path->dentry); 1954 calldata->path = *path;
1945 1955
1946 msg.rpc_argp = &calldata->arg, 1956 msg.rpc_argp = &calldata->arg,
1947 msg.rpc_resp = &calldata->res, 1957 msg.rpc_resp = &calldata->res,
@@ -2060,8 +2070,7 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
2060 case -EDQUOT: 2070 case -EDQUOT:
2061 case -ENOSPC: 2071 case -ENOSPC:
2062 case -EROFS: 2072 case -EROFS:
2063 lookup_instantiate_filp(nd, (struct dentry *)state, NULL); 2073 return PTR_ERR(state);
2064 return 1;
2065 default: 2074 default:
2066 goto out_drop; 2075 goto out_drop;
2067 } 2076 }
@@ -3141,10 +3150,19 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
3141 * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special 3150 * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special
3142 * standalone procedure for queueing an asynchronous RENEW. 3151 * standalone procedure for queueing an asynchronous RENEW.
3143 */ 3152 */
3153static void nfs4_renew_release(void *data)
3154{
3155 struct nfs_client *clp = data;
3156
3157 if (atomic_read(&clp->cl_count) > 1)
3158 nfs4_schedule_state_renewal(clp);
3159 nfs_put_client(clp);
3160}
3161
3144static void nfs4_renew_done(struct rpc_task *task, void *data) 3162static void nfs4_renew_done(struct rpc_task *task, void *data)
3145{ 3163{
3146 struct nfs_client *clp = (struct nfs_client *)task->tk_msg.rpc_argp; 3164 struct nfs_client *clp = data;
3147 unsigned long timestamp = (unsigned long)data; 3165 unsigned long timestamp = task->tk_start;
3148 3166
3149 if (task->tk_status < 0) { 3167 if (task->tk_status < 0) {
3150 /* Unless we're shutting down, schedule state recovery! */ 3168 /* Unless we're shutting down, schedule state recovery! */
@@ -3160,6 +3178,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *data)
3160 3178
3161static const struct rpc_call_ops nfs4_renew_ops = { 3179static const struct rpc_call_ops nfs4_renew_ops = {
3162 .rpc_call_done = nfs4_renew_done, 3180 .rpc_call_done = nfs4_renew_done,
3181 .rpc_release = nfs4_renew_release,
3163}; 3182};
3164 3183
3165int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred) 3184int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
@@ -3170,8 +3189,10 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
3170 .rpc_cred = cred, 3189 .rpc_cred = cred,
3171 }; 3190 };
3172 3191
3192 if (!atomic_inc_not_zero(&clp->cl_count))
3193 return -EIO;
3173 return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, 3194 return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
3174 &nfs4_renew_ops, (void *)jiffies); 3195 &nfs4_renew_ops, clp);
3175} 3196}
3176 3197
3177int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) 3198int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
@@ -3422,15 +3443,14 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3422 if (state == NULL) 3443 if (state == NULL)
3423 break; 3444 break;
3424 nfs4_state_mark_reclaim_nograce(clp, state); 3445 nfs4_state_mark_reclaim_nograce(clp, state);
3425 case -NFS4ERR_STALE_CLIENTID: 3446 goto do_state_recovery;
3426 case -NFS4ERR_STALE_STATEID: 3447 case -NFS4ERR_STALE_STATEID:
3448 if (state == NULL)
3449 break;
3450 nfs4_state_mark_reclaim_reboot(clp, state);
3451 case -NFS4ERR_STALE_CLIENTID:
3427 case -NFS4ERR_EXPIRED: 3452 case -NFS4ERR_EXPIRED:
3428 rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); 3453 goto do_state_recovery;
3429 nfs4_schedule_state_recovery(clp);
3430 if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
3431 rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
3432 task->tk_status = 0;
3433 return -EAGAIN;
3434#if defined(CONFIG_NFS_V4_1) 3454#if defined(CONFIG_NFS_V4_1)
3435 case -NFS4ERR_BADSESSION: 3455 case -NFS4ERR_BADSESSION:
3436 case -NFS4ERR_BADSLOT: 3456 case -NFS4ERR_BADSLOT:
@@ -3449,6 +3469,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3449 if (server) 3469 if (server)
3450 nfs_inc_server_stats(server, NFSIOS_DELAY); 3470 nfs_inc_server_stats(server, NFSIOS_DELAY);
3451 case -NFS4ERR_GRACE: 3471 case -NFS4ERR_GRACE:
3472 case -EKEYEXPIRED:
3452 rpc_delay(task, NFS4_POLL_RETRY_MAX); 3473 rpc_delay(task, NFS4_POLL_RETRY_MAX);
3453 task->tk_status = 0; 3474 task->tk_status = 0;
3454 return -EAGAIN; 3475 return -EAGAIN;
@@ -3458,6 +3479,13 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3458 } 3479 }
3459 task->tk_status = nfs4_map_errors(task->tk_status); 3480 task->tk_status = nfs4_map_errors(task->tk_status);
3460 return 0; 3481 return 0;
3482do_state_recovery:
3483 rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
3484 nfs4_schedule_state_recovery(clp);
3485 if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
3486 rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
3487 task->tk_status = 0;
3488 return -EAGAIN;
3461} 3489}
3462 3490
3463static int 3491static int
@@ -3554,6 +3582,7 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred)
3554 case -NFS4ERR_RESOURCE: 3582 case -NFS4ERR_RESOURCE:
3555 /* The IBM lawyers misread another document! */ 3583 /* The IBM lawyers misread another document! */
3556 case -NFS4ERR_DELAY: 3584 case -NFS4ERR_DELAY:
3585 case -EKEYEXPIRED:
3557 err = nfs4_delay(clp->cl_rpcclient, &timeout); 3586 err = nfs4_delay(clp->cl_rpcclient, &timeout);
3558 } 3587 }
3559 } while (err == 0); 3588 } while (err == 0);
@@ -4088,6 +4117,28 @@ static const struct rpc_call_ops nfs4_recover_lock_ops = {
4088 .rpc_release = nfs4_lock_release, 4117 .rpc_release = nfs4_lock_release,
4089}; 4118};
4090 4119
4120static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error)
4121{
4122 struct nfs_client *clp = server->nfs_client;
4123 struct nfs4_state *state = lsp->ls_state;
4124
4125 switch (error) {
4126 case -NFS4ERR_ADMIN_REVOKED:
4127 case -NFS4ERR_BAD_STATEID:
4128 case -NFS4ERR_EXPIRED:
4129 if (new_lock_owner != 0 ||
4130 (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
4131 nfs4_state_mark_reclaim_nograce(clp, state);
4132 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
4133 break;
4134 case -NFS4ERR_STALE_STATEID:
4135 if (new_lock_owner != 0 ||
4136 (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
4137 nfs4_state_mark_reclaim_reboot(clp, state);
4138 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
4139 };
4140}
4141
4091static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int recovery_type) 4142static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int recovery_type)
4092{ 4143{
4093 struct nfs4_lockdata *data; 4144 struct nfs4_lockdata *data;
@@ -4126,6 +4177,9 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
4126 ret = nfs4_wait_for_completion_rpc_task(task); 4177 ret = nfs4_wait_for_completion_rpc_task(task);
4127 if (ret == 0) { 4178 if (ret == 0) {
4128 ret = data->rpc_status; 4179 ret = data->rpc_status;
4180 if (ret)
4181 nfs4_handle_setlk_error(data->server, data->lsp,
4182 data->arg.new_lock_owner, ret);
4129 } else 4183 } else
4130 data->cancelled = 1; 4184 data->cancelled = 1;
4131 rpc_put_task(task); 4185 rpc_put_task(task);
@@ -4144,7 +4198,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
4144 if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) 4198 if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
4145 return 0; 4199 return 0;
4146 err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); 4200 err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
4147 if (err != -NFS4ERR_DELAY) 4201 if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED)
4148 break; 4202 break;
4149 nfs4_handle_exception(server, err, &exception); 4203 nfs4_handle_exception(server, err, &exception);
4150 } while (exception.retry); 4204 } while (exception.retry);
@@ -4169,6 +4223,7 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
4169 goto out; 4223 goto out;
4170 case -NFS4ERR_GRACE: 4224 case -NFS4ERR_GRACE:
4171 case -NFS4ERR_DELAY: 4225 case -NFS4ERR_DELAY:
4226 case -EKEYEXPIRED:
4172 nfs4_handle_exception(server, err, &exception); 4227 nfs4_handle_exception(server, err, &exception);
4173 err = 0; 4228 err = 0;
4174 } 4229 }
@@ -4181,8 +4236,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
4181{ 4236{
4182 struct nfs_inode *nfsi = NFS_I(state->inode); 4237 struct nfs_inode *nfsi = NFS_I(state->inode);
4183 unsigned char fl_flags = request->fl_flags; 4238 unsigned char fl_flags = request->fl_flags;
4184 int status; 4239 int status = -ENOLCK;
4185 4240
4241 if ((fl_flags & FL_POSIX) &&
4242 !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
4243 goto out;
4186 /* Is this a delegated open? */ 4244 /* Is this a delegated open? */
4187 status = nfs4_set_lock_state(state, request); 4245 status = nfs4_set_lock_state(state, request);
4188 if (status != 0) 4246 if (status != 0)
@@ -4317,6 +4375,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4317 err = 0; 4375 err = 0;
4318 goto out; 4376 goto out;
4319 case -NFS4ERR_DELAY: 4377 case -NFS4ERR_DELAY:
4378 case -EKEYEXPIRED:
4320 break; 4379 break;
4321 } 4380 }
4322 err = nfs4_handle_exception(server, err, &exception); 4381 err = nfs4_handle_exception(server, err, &exception);
@@ -4462,7 +4521,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4462 4521
4463 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); 4522 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
4464 4523
4465 if (status != NFS4ERR_CLID_INUSE) 4524 if (status != -NFS4ERR_CLID_INUSE)
4466 break; 4525 break;
4467 4526
4468 if (signalled()) 4527 if (signalled())
@@ -4516,6 +4575,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
4516 switch (task->tk_status) { 4575 switch (task->tk_status) {
4517 case -NFS4ERR_DELAY: 4576 case -NFS4ERR_DELAY:
4518 case -NFS4ERR_GRACE: 4577 case -NFS4ERR_GRACE:
4578 case -EKEYEXPIRED:
4519 dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); 4579 dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);
4520 rpc_delay(task, NFS4_POLL_RETRY_MIN); 4580 rpc_delay(task, NFS4_POLL_RETRY_MIN);
4521 task->tk_status = 0; 4581 task->tk_status = 0;
@@ -4573,26 +4633,32 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
4573/* 4633/*
4574 * Reset a slot table 4634 * Reset a slot table
4575 */ 4635 */
4576static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, int max_slots, 4636static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
4577 int old_max_slots, int ivalue) 4637 int ivalue)
4578{ 4638{
4639 struct nfs4_slot *new = NULL;
4579 int i; 4640 int i;
4580 int ret = 0; 4641 int ret = 0;
4581 4642
4582 dprintk("--> %s: max_reqs=%u, tbl %p\n", __func__, max_slots, tbl); 4643 dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__,
4644 max_reqs, tbl->max_slots);
4583 4645
4584 /* 4646 /* Does the newly negotiated max_reqs match the existing slot table? */
4585 * Until we have dynamic slot table adjustment, insist 4647 if (max_reqs != tbl->max_slots) {
4586 * upon the same slot table size 4648 ret = -ENOMEM;
4587 */ 4649 new = kmalloc(max_reqs * sizeof(struct nfs4_slot),
4588 if (max_slots != old_max_slots) { 4650 GFP_KERNEL);
4589 dprintk("%s reset slot table does't match old\n", 4651 if (!new)
4590 __func__); 4652 goto out;
4591 ret = -EINVAL; /*XXX NFS4ERR_REQ_TOO_BIG ? */ 4653 ret = 0;
4592 goto out; 4654 kfree(tbl->slots);
4593 } 4655 }
4594 spin_lock(&tbl->slot_tbl_lock); 4656 spin_lock(&tbl->slot_tbl_lock);
4595 for (i = 0; i < max_slots; ++i) 4657 if (new) {
4658 tbl->slots = new;
4659 tbl->max_slots = max_reqs;
4660 }
4661 for (i = 0; i < tbl->max_slots; ++i)
4596 tbl->slots[i].seq_nr = ivalue; 4662 tbl->slots[i].seq_nr = ivalue;
4597 spin_unlock(&tbl->slot_tbl_lock); 4663 spin_unlock(&tbl->slot_tbl_lock);
4598 dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, 4664 dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
@@ -4610,16 +4676,12 @@ static int nfs4_reset_slot_tables(struct nfs4_session *session)
4610 int status; 4676 int status;
4611 4677
4612 status = nfs4_reset_slot_table(&session->fc_slot_table, 4678 status = nfs4_reset_slot_table(&session->fc_slot_table,
4613 session->fc_attrs.max_reqs, 4679 session->fc_attrs.max_reqs, 1);
4614 session->fc_slot_table.max_slots,
4615 1);
4616 if (status) 4680 if (status)
4617 return status; 4681 return status;
4618 4682
4619 status = nfs4_reset_slot_table(&session->bc_slot_table, 4683 status = nfs4_reset_slot_table(&session->bc_slot_table,
4620 session->bc_attrs.max_reqs, 4684 session->bc_attrs.max_reqs, 0);
4621 session->bc_slot_table.max_slots,
4622 0);
4623 return status; 4685 return status;
4624} 4686}
4625 4687
@@ -4760,16 +4822,14 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
4760 args->fc_attrs.headerpadsz = 0; 4822 args->fc_attrs.headerpadsz = 0;
4761 args->fc_attrs.max_rqst_sz = mxrqst_sz; 4823 args->fc_attrs.max_rqst_sz = mxrqst_sz;
4762 args->fc_attrs.max_resp_sz = mxresp_sz; 4824 args->fc_attrs.max_resp_sz = mxresp_sz;
4763 args->fc_attrs.max_resp_sz_cached = mxresp_sz;
4764 args->fc_attrs.max_ops = NFS4_MAX_OPS; 4825 args->fc_attrs.max_ops = NFS4_MAX_OPS;
4765 args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs; 4826 args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs;
4766 4827
4767 dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u " 4828 dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u "
4768 "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n", 4829 "max_ops=%u max_reqs=%u\n",
4769 __func__, 4830 __func__,
4770 args->fc_attrs.max_rqst_sz, args->fc_attrs.max_resp_sz, 4831 args->fc_attrs.max_rqst_sz, args->fc_attrs.max_resp_sz,
4771 args->fc_attrs.max_resp_sz_cached, args->fc_attrs.max_ops, 4832 args->fc_attrs.max_ops, args->fc_attrs.max_reqs);
4772 args->fc_attrs.max_reqs);
4773 4833
4774 /* Back channel attributes */ 4834 /* Back channel attributes */
4775 args->bc_attrs.headerpadsz = 0; 4835 args->bc_attrs.headerpadsz = 0;
@@ -4978,7 +5038,16 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
4978 &res, args.sa_cache_this, 1); 5038 &res, args.sa_cache_this, 1);
4979} 5039}
4980 5040
4981void nfs41_sequence_call_done(struct rpc_task *task, void *data) 5041static void nfs41_sequence_release(void *data)
5042{
5043 struct nfs_client *clp = (struct nfs_client *)data;
5044
5045 if (atomic_read(&clp->cl_count) > 1)
5046 nfs4_schedule_state_renewal(clp);
5047 nfs_put_client(clp);
5048}
5049
5050static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
4982{ 5051{
4983 struct nfs_client *clp = (struct nfs_client *)data; 5052 struct nfs_client *clp = (struct nfs_client *)data;
4984 5053
@@ -4986,6 +5055,8 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data)
4986 5055
4987 if (task->tk_status < 0) { 5056 if (task->tk_status < 0) {
4988 dprintk("%s ERROR %d\n", __func__, task->tk_status); 5057 dprintk("%s ERROR %d\n", __func__, task->tk_status);
5058 if (atomic_read(&clp->cl_count) == 1)
5059 goto out;
4989 5060
4990 if (_nfs4_async_handle_error(task, NULL, clp, NULL) 5061 if (_nfs4_async_handle_error(task, NULL, clp, NULL)
4991 == -EAGAIN) { 5062 == -EAGAIN) {
@@ -4994,7 +5065,7 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data)
4994 } 5065 }
4995 } 5066 }
4996 dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); 5067 dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
4997 5068out:
4998 kfree(task->tk_msg.rpc_argp); 5069 kfree(task->tk_msg.rpc_argp);
4999 kfree(task->tk_msg.rpc_resp); 5070 kfree(task->tk_msg.rpc_resp);
5000 5071
@@ -5019,6 +5090,7 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
5019static const struct rpc_call_ops nfs41_sequence_ops = { 5090static const struct rpc_call_ops nfs41_sequence_ops = {
5020 .rpc_call_done = nfs41_sequence_call_done, 5091 .rpc_call_done = nfs41_sequence_call_done,
5021 .rpc_call_prepare = nfs41_sequence_prepare, 5092 .rpc_call_prepare = nfs41_sequence_prepare,
5093 .rpc_release = nfs41_sequence_release,
5022}; 5094};
5023 5095
5024static int nfs41_proc_async_sequence(struct nfs_client *clp, 5096static int nfs41_proc_async_sequence(struct nfs_client *clp,
@@ -5031,12 +5103,14 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp,
5031 .rpc_cred = cred, 5103 .rpc_cred = cred,
5032 }; 5104 };
5033 5105
5106 if (!atomic_inc_not_zero(&clp->cl_count))
5107 return -EIO;
5034 args = kzalloc(sizeof(*args), GFP_KERNEL); 5108 args = kzalloc(sizeof(*args), GFP_KERNEL);
5035 if (!args)
5036 return -ENOMEM;
5037 res = kzalloc(sizeof(*res), GFP_KERNEL); 5109 res = kzalloc(sizeof(*res), GFP_KERNEL);
5038 if (!res) { 5110 if (!args || !res) {
5039 kfree(args); 5111 kfree(args);
5112 kfree(res);
5113 nfs_put_client(clp);
5040 return -ENOMEM; 5114 return -ENOMEM;
5041 } 5115 }
5042 res->sr_slotid = NFS4_MAX_SLOT_TABLE; 5116 res->sr_slotid = NFS4_MAX_SLOT_TABLE;
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 0156c01c212c..d87f10327b72 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -36,11 +36,6 @@
36 * as an rpc_task, not a real kernel thread, so it always runs in rpciod's 36 * as an rpc_task, not a real kernel thread, so it always runs in rpciod's
37 * context. There is one renewd per nfs_server. 37 * context. There is one renewd per nfs_server.
38 * 38 *
39 * TODO: If the send queue gets backlogged (e.g., if the server goes down),
40 * we will keep filling the queue with periodic RENEW requests. We need a
41 * mechanism for ensuring that if renewd successfully sends off a request,
42 * then it only wakes up when the request is finished. Maybe use the
43 * child task framework of the RPC layer?
44 */ 39 */
45 40
46#include <linux/mm.h> 41#include <linux/mm.h>
@@ -63,7 +58,7 @@ nfs4_renew_state(struct work_struct *work)
63 struct nfs_client *clp = 58 struct nfs_client *clp =
64 container_of(work, struct nfs_client, cl_renewd.work); 59 container_of(work, struct nfs_client, cl_renewd.work);
65 struct rpc_cred *cred; 60 struct rpc_cred *cred;
66 long lease, timeout; 61 long lease;
67 unsigned long last, now; 62 unsigned long last, now;
68 63
69 ops = nfs4_state_renewal_ops[clp->cl_minorversion]; 64 ops = nfs4_state_renewal_ops[clp->cl_minorversion];
@@ -75,7 +70,6 @@ nfs4_renew_state(struct work_struct *work)
75 lease = clp->cl_lease_time; 70 lease = clp->cl_lease_time;
76 last = clp->cl_last_renewal; 71 last = clp->cl_last_renewal;
77 now = jiffies; 72 now = jiffies;
78 timeout = (2 * lease) / 3 + (long)last - (long)now;
79 /* Are we close to a lease timeout? */ 73 /* Are we close to a lease timeout? */
80 if (time_after(now, last + lease/3)) { 74 if (time_after(now, last + lease/3)) {
81 cred = ops->get_state_renewal_cred_locked(clp); 75 cred = ops->get_state_renewal_cred_locked(clp);
@@ -90,19 +84,15 @@ nfs4_renew_state(struct work_struct *work)
90 /* Queue an asynchronous RENEW. */ 84 /* Queue an asynchronous RENEW. */
91 ops->sched_state_renewal(clp, cred); 85 ops->sched_state_renewal(clp, cred);
92 put_rpccred(cred); 86 put_rpccred(cred);
87 goto out_exp;
93 } 88 }
94 timeout = (2 * lease) / 3; 89 } else {
95 spin_lock(&clp->cl_lock);
96 } else
97 dprintk("%s: failed to call renewd. Reason: lease not expired \n", 90 dprintk("%s: failed to call renewd. Reason: lease not expired \n",
98 __func__); 91 __func__);
99 if (timeout < 5 * HZ) /* safeguard */ 92 spin_unlock(&clp->cl_lock);
100 timeout = 5 * HZ; 93 }
101 dprintk("%s: requeueing work. Lease period = %ld\n", 94 nfs4_schedule_state_renewal(clp);
102 __func__, (timeout + HZ - 1) / HZ); 95out_exp:
103 cancel_delayed_work(&clp->cl_renewd);
104 schedule_delayed_work(&clp->cl_renewd, timeout);
105 spin_unlock(&clp->cl_lock);
106 nfs_expire_unreferenced_delegations(clp); 96 nfs_expire_unreferenced_delegations(clp);
107out: 97out:
108 dprintk("%s: done\n", __func__); 98 dprintk("%s: done\n", __func__);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 6d263ed79e92..6c5ed51f105e 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -901,7 +901,7 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp)
901 nfs4_schedule_state_manager(clp); 901 nfs4_schedule_state_manager(clp);
902} 902}
903 903
904static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) 904int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
905{ 905{
906 906
907 set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); 907 set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
@@ -1249,26 +1249,65 @@ static int nfs4_reclaim_lease(struct nfs_client *clp)
1249} 1249}
1250 1250
1251#ifdef CONFIG_NFS_V4_1 1251#ifdef CONFIG_NFS_V4_1
1252void nfs41_handle_recall_slot(struct nfs_client *clp)
1253{
1254 set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
1255 nfs4_schedule_state_recovery(clp);
1256}
1257
1258static void nfs4_reset_all_state(struct nfs_client *clp)
1259{
1260 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
1261 clp->cl_boot_time = CURRENT_TIME;
1262 nfs4_state_start_reclaim_nograce(clp);
1263 nfs4_schedule_state_recovery(clp);
1264 }
1265}
1266
1267static void nfs41_handle_server_reboot(struct nfs_client *clp)
1268{
1269 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
1270 nfs4_state_start_reclaim_reboot(clp);
1271 nfs4_schedule_state_recovery(clp);
1272 }
1273}
1274
1275static void nfs41_handle_state_revoked(struct nfs_client *clp)
1276{
1277 /* Temporary */
1278 nfs4_reset_all_state(clp);
1279}
1280
1281static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp)
1282{
1283 /* This will need to handle layouts too */
1284 nfs_expire_all_delegations(clp);
1285}
1286
1287static void nfs41_handle_cb_path_down(struct nfs_client *clp)
1288{
1289 nfs_expire_all_delegations(clp);
1290 if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0)
1291 nfs4_schedule_state_recovery(clp);
1292}
1293
1252void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) 1294void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
1253{ 1295{
1254 if (!flags) 1296 if (!flags)
1255 return; 1297 return;
1256 else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) { 1298 else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED)
1257 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 1299 nfs41_handle_server_reboot(clp);
1258 nfs4_state_start_reclaim_reboot(clp); 1300 else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |
1259 nfs4_schedule_state_recovery(clp);
1260 } else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |
1261 SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | 1301 SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED |
1262 SEQ4_STATUS_ADMIN_STATE_REVOKED | 1302 SEQ4_STATUS_ADMIN_STATE_REVOKED |
1263 SEQ4_STATUS_RECALLABLE_STATE_REVOKED | 1303 SEQ4_STATUS_LEASE_MOVED))
1264 SEQ4_STATUS_LEASE_MOVED)) { 1304 nfs41_handle_state_revoked(clp);
1265 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 1305 else if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
1266 nfs4_state_start_reclaim_nograce(clp); 1306 nfs41_handle_recallable_state_revoked(clp);
1267 nfs4_schedule_state_recovery(clp); 1307 else if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
1268 } else if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
1269 SEQ4_STATUS_BACKCHANNEL_FAULT | 1308 SEQ4_STATUS_BACKCHANNEL_FAULT |
1270 SEQ4_STATUS_CB_PATH_DOWN_SESSION)) 1309 SEQ4_STATUS_CB_PATH_DOWN_SESSION))
1271 nfs_expire_all_delegations(clp); 1310 nfs41_handle_cb_path_down(clp);
1272} 1311}
1273 1312
1274static int nfs4_reset_session(struct nfs_client *clp) 1313static int nfs4_reset_session(struct nfs_client *clp)
@@ -1285,23 +1324,52 @@ static int nfs4_reset_session(struct nfs_client *clp)
1285 1324
1286 memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN); 1325 memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN);
1287 status = nfs4_proc_create_session(clp); 1326 status = nfs4_proc_create_session(clp);
1288 if (status) 1327 if (status) {
1289 status = nfs4_recovery_handle_error(clp, status); 1328 status = nfs4_recovery_handle_error(clp, status);
1329 goto out;
1330 }
1331 /* create_session negotiated new slot table */
1332 clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
1290 1333
1291out: 1334 /* Let the state manager reestablish state */
1292 /* 1335 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
1293 * Let the state manager reestablish state
1294 */
1295 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
1296 status == 0)
1297 nfs41_setup_state_renewal(clp); 1336 nfs41_setup_state_renewal(clp);
1298 1337out:
1299 return status; 1338 return status;
1300} 1339}
1301 1340
1341static int nfs4_recall_slot(struct nfs_client *clp)
1342{
1343 struct nfs4_slot_table *fc_tbl = &clp->cl_session->fc_slot_table;
1344 struct nfs4_channel_attrs *fc_attrs = &clp->cl_session->fc_attrs;
1345 struct nfs4_slot *new, *old;
1346 int i;
1347
1348 nfs4_begin_drain_session(clp);
1349 new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot),
1350 GFP_KERNEL);
1351 if (!new)
1352 return -ENOMEM;
1353
1354 spin_lock(&fc_tbl->slot_tbl_lock);
1355 for (i = 0; i < fc_tbl->target_max_slots; i++)
1356 new[i].seq_nr = fc_tbl->slots[i].seq_nr;
1357 old = fc_tbl->slots;
1358 fc_tbl->slots = new;
1359 fc_tbl->max_slots = fc_tbl->target_max_slots;
1360 fc_tbl->target_max_slots = 0;
1361 fc_attrs->max_reqs = fc_tbl->max_slots;
1362 spin_unlock(&fc_tbl->slot_tbl_lock);
1363
1364 kfree(old);
1365 nfs4_end_drain_session(clp);
1366 return 0;
1367}
1368
1302#else /* CONFIG_NFS_V4_1 */ 1369#else /* CONFIG_NFS_V4_1 */
1303static int nfs4_reset_session(struct nfs_client *clp) { return 0; } 1370static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
1304static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; } 1371static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; }
1372static int nfs4_recall_slot(struct nfs_client *clp) { return 0; }
1305#endif /* CONFIG_NFS_V4_1 */ 1373#endif /* CONFIG_NFS_V4_1 */
1306 1374
1307/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors 1375/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors
@@ -1314,6 +1382,7 @@ static void nfs4_set_lease_expired(struct nfs_client *clp, int status)
1314 case -NFS4ERR_DELAY: 1382 case -NFS4ERR_DELAY:
1315 case -NFS4ERR_CLID_INUSE: 1383 case -NFS4ERR_CLID_INUSE:
1316 case -EAGAIN: 1384 case -EAGAIN:
1385 case -EKEYEXPIRED:
1317 break; 1386 break;
1318 1387
1319 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery 1388 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
@@ -1397,6 +1466,15 @@ static void nfs4_state_manager(struct nfs_client *clp)
1397 nfs_client_return_marked_delegations(clp); 1466 nfs_client_return_marked_delegations(clp);
1398 continue; 1467 continue;
1399 } 1468 }
1469 /* Recall session slots */
1470 if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state)
1471 && nfs4_has_session(clp)) {
1472 status = nfs4_recall_slot(clp);
1473 if (status < 0)
1474 goto out_error;
1475 continue;
1476 }
1477
1400 1478
1401 nfs4_clear_state_manager_bit(clp); 1479 nfs4_clear_state_manager_bit(clp);
1402 /* Did we race with an attempt to give us more work? */ 1480 /* Did we race with an attempt to give us more work? */
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e437fd6a819f..38f3b582e7c2 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -38,7 +38,6 @@
38#include <linux/param.h> 38#include <linux/param.h>
39#include <linux/time.h> 39#include <linux/time.h>
40#include <linux/mm.h> 40#include <linux/mm.h>
41#include <linux/slab.h>
42#include <linux/errno.h> 41#include <linux/errno.h>
43#include <linux/string.h> 42#include <linux/string.h>
44#include <linux/in.h> 43#include <linux/in.h>
@@ -1578,6 +1577,14 @@ static void encode_create_session(struct xdr_stream *xdr,
1578 char machine_name[NFS4_MAX_MACHINE_NAME_LEN]; 1577 char machine_name[NFS4_MAX_MACHINE_NAME_LEN];
1579 uint32_t len; 1578 uint32_t len;
1580 struct nfs_client *clp = args->client; 1579 struct nfs_client *clp = args->client;
1580 u32 max_resp_sz_cached;
1581
1582 /*
1583 * Assumes OPEN is the biggest non-idempotent compound.
1584 * 2 is the verifier.
1585 */
1586 max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE +
1587 RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT;
1581 1588
1582 len = scnprintf(machine_name, sizeof(machine_name), "%s", 1589 len = scnprintf(machine_name, sizeof(machine_name), "%s",
1583 clp->cl_ipaddr); 1590 clp->cl_ipaddr);
@@ -1592,7 +1599,7 @@ static void encode_create_session(struct xdr_stream *xdr,
1592 *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ 1599 *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */
1593 *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */ 1600 *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */
1594 *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */ 1601 *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */
1595 *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */ 1602 *p++ = cpu_to_be32(max_resp_sz_cached); /* Max resp sz cached */
1596 *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */ 1603 *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */
1597 *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */ 1604 *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */
1598 *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ 1605 *p++ = cpu_to_be32(0); /* rdmachannel_attrs */
@@ -4631,7 +4638,7 @@ static int decode_sequence(struct xdr_stream *xdr,
4631 * If the server returns different values for sessionID, slotID or 4638 * If the server returns different values for sessionID, slotID or
4632 * sequence number, the server is looney tunes. 4639 * sequence number, the server is looney tunes.
4633 */ 4640 */
4634 status = -ESERVERFAULT; 4641 status = -EREMOTEIO;
4635 4642
4636 if (memcmp(id.data, res->sr_session->sess_id.data, 4643 if (memcmp(id.data, res->sr_session->sess_id.data,
4637 NFS4_MAX_SESSIONID_LEN)) { 4644 NFS4_MAX_SESSIONID_LEN)) {
@@ -5544,6 +5551,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
5544 if (status != 0) 5551 if (status != 0)
5545 goto out; 5552 goto out;
5546 status = decode_delegreturn(&xdr); 5553 status = decode_delegreturn(&xdr);
5554 if (status != 0)
5555 goto out;
5547 decode_getfattr(&xdr, res->fattr, res->server, 5556 decode_getfattr(&xdr, res->fattr, res->server,
5548 !RPC_IS_ASYNC(rqstp->rq_task)); 5557 !RPC_IS_ASYNC(rqstp->rq_task));
5549out: 5558out:
@@ -5774,7 +5783,7 @@ static struct {
5774 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, 5783 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE },
5775 { NFS4ERR_NOTSUPP, -ENOTSUPP }, 5784 { NFS4ERR_NOTSUPP, -ENOTSUPP },
5776 { NFS4ERR_TOOSMALL, -ETOOSMALL }, 5785 { NFS4ERR_TOOSMALL, -ETOOSMALL },
5777 { NFS4ERR_SERVERFAULT, -ESERVERFAULT }, 5786 { NFS4ERR_SERVERFAULT, -EREMOTEIO },
5778 { NFS4ERR_BADTYPE, -EBADTYPE }, 5787 { NFS4ERR_BADTYPE, -EBADTYPE },
5779 { NFS4ERR_LOCKED, -EAGAIN }, 5788 { NFS4ERR_LOCKED, -EAGAIN },
5780 { NFS4ERR_SYMLINK, -ELOOP }, 5789 { NFS4ERR_SYMLINK, -ELOOP },
@@ -5801,7 +5810,7 @@ nfs4_stat_to_errno(int stat)
5801 } 5810 }
5802 if (stat <= 10000 || stat > 10100) { 5811 if (stat <= 10000 || stat > 10100) {
5803 /* The server is looney tunes. */ 5812 /* The server is looney tunes. */
5804 return -ESERVERFAULT; 5813 return -EREMOTEIO;
5805 } 5814 }
5806 /* If we cannot translate the error, the recovery routines should 5815 /* If we cannot translate the error, the recovery routines should
5807 * handle it. 5816 * handle it.
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index e2975939126a..29d9d36cd5f4 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -112,12 +112,10 @@ void nfs_unlock_request(struct nfs_page *req)
112 */ 112 */
113int nfs_set_page_tag_locked(struct nfs_page *req) 113int nfs_set_page_tag_locked(struct nfs_page *req)
114{ 114{
115 struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode);
116
117 if (!nfs_lock_request_dontget(req)) 115 if (!nfs_lock_request_dontget(req))
118 return 0; 116 return 0;
119 if (req->wb_page != NULL) 117 if (req->wb_page != NULL)
120 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); 118 radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
121 return 1; 119 return 1;
122} 120}
123 121
@@ -126,10 +124,10 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
126 */ 124 */
127void nfs_clear_page_tag_locked(struct nfs_page *req) 125void nfs_clear_page_tag_locked(struct nfs_page *req)
128{ 126{
129 struct inode *inode = req->wb_context->path.dentry->d_inode;
130 struct nfs_inode *nfsi = NFS_I(inode);
131
132 if (req->wb_page != NULL) { 127 if (req->wb_page != NULL) {
128 struct inode *inode = req->wb_context->path.dentry->d_inode;
129 struct nfs_inode *nfsi = NFS_I(inode);
130
133 spin_lock(&inode->i_lock); 131 spin_lock(&inode->i_lock);
134 radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); 132 radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
135 nfs_unlock_request(req); 133 nfs_unlock_request(req);
@@ -142,16 +140,22 @@ void nfs_clear_page_tag_locked(struct nfs_page *req)
142 * nfs_clear_request - Free up all resources allocated to the request 140 * nfs_clear_request - Free up all resources allocated to the request
143 * @req: 141 * @req:
144 * 142 *
145 * Release page resources associated with a write request after it 143 * Release page and open context resources associated with a read/write
146 * has completed. 144 * request after it has completed.
147 */ 145 */
148void nfs_clear_request(struct nfs_page *req) 146void nfs_clear_request(struct nfs_page *req)
149{ 147{
150 struct page *page = req->wb_page; 148 struct page *page = req->wb_page;
149 struct nfs_open_context *ctx = req->wb_context;
150
151 if (page != NULL) { 151 if (page != NULL) {
152 page_cache_release(page); 152 page_cache_release(page);
153 req->wb_page = NULL; 153 req->wb_page = NULL;
154 } 154 }
155 if (ctx != NULL) {
156 put_nfs_open_context(ctx);
157 req->wb_context = NULL;
158 }
155} 159}
156 160
157 161
@@ -165,9 +169,8 @@ static void nfs_free_request(struct kref *kref)
165{ 169{
166 struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); 170 struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
167 171
168 /* Release struct file or cached credential */ 172 /* Release struct file and open context */
169 nfs_clear_request(req); 173 nfs_clear_request(req);
170 put_nfs_open_context(req->wb_context);
171 nfs_page_free(req); 174 nfs_page_free(req);
172} 175}
173 176
@@ -176,6 +179,12 @@ void nfs_release_request(struct nfs_page *req)
176 kref_put(&req->wb_kref, nfs_free_request); 179 kref_put(&req->wb_kref, nfs_free_request);
177} 180}
178 181
182static int nfs_wait_bit_uninterruptible(void *word)
183{
184 io_schedule();
185 return 0;
186}
187
179/** 188/**
180 * nfs_wait_on_request - Wait for a request to complete. 189 * nfs_wait_on_request - Wait for a request to complete.
181 * @req: request to wait upon. 190 * @req: request to wait upon.
@@ -186,14 +195,9 @@ void nfs_release_request(struct nfs_page *req)
186int 195int
187nfs_wait_on_request(struct nfs_page *req) 196nfs_wait_on_request(struct nfs_page *req)
188{ 197{
189 int ret = 0; 198 return wait_on_bit(&req->wb_flags, PG_BUSY,
190 199 nfs_wait_bit_uninterruptible,
191 if (!test_bit(PG_BUSY, &req->wb_flags)) 200 TASK_UNINTERRUPTIBLE);
192 goto out;
193 ret = out_of_line_wait_on_bit(&req->wb_flags, PG_BUSY,
194 nfs_wait_bit_killable, TASK_KILLABLE);
195out:
196 return ret;
197} 201}
198 202
199/** 203/**
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index ef583854d8d0..0288be80444f 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -29,7 +29,6 @@
29 29
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/param.h> 31#include <linux/param.h>
32#include <linux/slab.h>
33#include <linux/time.h> 32#include <linux/time.h>
34#include <linux/mm.h> 33#include <linux/mm.h>
35#include <linux/errno.h> 34#include <linux/errno.h>
@@ -47,6 +46,39 @@
47#define NFSDBG_FACILITY NFSDBG_PROC 46#define NFSDBG_FACILITY NFSDBG_PROC
48 47
49/* 48/*
49 * wrapper to handle the -EKEYEXPIRED error message. This should generally
50 * only happen if using krb5 auth and a user's TGT expires. NFSv2 doesn't
51 * support the NFSERR_JUKEBOX error code, but we handle this situation in the
52 * same way that we handle that error with NFSv3.
53 */
54static int
55nfs_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
56{
57 int res;
58 do {
59 res = rpc_call_sync(clnt, msg, flags);
60 if (res != -EKEYEXPIRED)
61 break;
62 schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
63 res = -ERESTARTSYS;
64 } while (!fatal_signal_pending(current));
65 return res;
66}
67
68#define rpc_call_sync(clnt, msg, flags) nfs_rpc_wrapper(clnt, msg, flags)
69
70static int
71nfs_async_handle_expired_key(struct rpc_task *task)
72{
73 if (task->tk_status != -EKEYEXPIRED)
74 return 0;
75 task->tk_status = 0;
76 rpc_restart_call(task);
77 rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
78 return 1;
79}
80
81/*
50 * Bare-bones access to getattr: this is for nfs_read_super. 82 * Bare-bones access to getattr: this is for nfs_read_super.
51 */ 83 */
52static int 84static int
@@ -307,6 +339,8 @@ nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
307 339
308static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) 340static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
309{ 341{
342 if (nfs_async_handle_expired_key(task))
343 return 0;
310 nfs_mark_for_revalidate(dir); 344 nfs_mark_for_revalidate(dir);
311 return 1; 345 return 1;
312} 346}
@@ -560,6 +594,9 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
560 594
561static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) 595static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
562{ 596{
597 if (nfs_async_handle_expired_key(task))
598 return -EAGAIN;
599
563 nfs_invalidate_atime(data->inode); 600 nfs_invalidate_atime(data->inode);
564 if (task->tk_status >= 0) { 601 if (task->tk_status >= 0) {
565 nfs_refresh_inode(data->inode, data->res.fattr); 602 nfs_refresh_inode(data->inode, data->res.fattr);
@@ -579,6 +616,9 @@ static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *
579 616
580static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) 617static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
581{ 618{
619 if (nfs_async_handle_expired_key(task))
620 return -EAGAIN;
621
582 if (task->tk_status >= 0) 622 if (task->tk_status >= 0)
583 nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr); 623 nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr);
584 return 0; 624 return 0;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ce907efc5508..e01637240eeb 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -48,6 +48,7 @@
48#include <linux/vfs.h> 48#include <linux/vfs.h>
49#include <linux/inet.h> 49#include <linux/inet.h>
50#include <linux/in6.h> 50#include <linux/in6.h>
51#include <linux/slab.h>
51#include <net/ipv6.h> 52#include <net/ipv6.h>
52#include <linux/netdevice.h> 53#include <linux/netdevice.h>
53#include <linux/nfs_xdr.h> 54#include <linux/nfs_xdr.h>
@@ -243,6 +244,7 @@ static int nfs_show_stats(struct seq_file *, struct vfsmount *);
243static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *); 244static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *);
244static int nfs_xdev_get_sb(struct file_system_type *fs_type, 245static int nfs_xdev_get_sb(struct file_system_type *fs_type,
245 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 246 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
247static void nfs_put_super(struct super_block *);
246static void nfs_kill_super(struct super_block *); 248static void nfs_kill_super(struct super_block *);
247static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); 249static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
248 250
@@ -266,6 +268,7 @@ static const struct super_operations nfs_sops = {
266 .alloc_inode = nfs_alloc_inode, 268 .alloc_inode = nfs_alloc_inode,
267 .destroy_inode = nfs_destroy_inode, 269 .destroy_inode = nfs_destroy_inode,
268 .write_inode = nfs_write_inode, 270 .write_inode = nfs_write_inode,
271 .put_super = nfs_put_super,
269 .statfs = nfs_statfs, 272 .statfs = nfs_statfs,
270 .clear_inode = nfs_clear_inode, 273 .clear_inode = nfs_clear_inode,
271 .umount_begin = nfs_umount_begin, 274 .umount_begin = nfs_umount_begin,
@@ -335,6 +338,7 @@ static const struct super_operations nfs4_sops = {
335 .alloc_inode = nfs_alloc_inode, 338 .alloc_inode = nfs_alloc_inode,
336 .destroy_inode = nfs_destroy_inode, 339 .destroy_inode = nfs_destroy_inode,
337 .write_inode = nfs_write_inode, 340 .write_inode = nfs_write_inode,
341 .put_super = nfs_put_super,
338 .statfs = nfs_statfs, 342 .statfs = nfs_statfs,
339 .clear_inode = nfs4_clear_inode, 343 .clear_inode = nfs4_clear_inode,
340 .umount_begin = nfs_umount_begin, 344 .umount_begin = nfs_umount_begin,
@@ -2211,7 +2215,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2211 } else { 2215 } else {
2212 error = nfs_bdi_register(server); 2216 error = nfs_bdi_register(server);
2213 if (error) 2217 if (error)
2214 goto error_splat_super; 2218 goto error_splat_bdi;
2215 } 2219 }
2216 2220
2217 if (!s->s_root) { 2221 if (!s->s_root) {
@@ -2253,11 +2257,25 @@ out_err_nosb:
2253error_splat_root: 2257error_splat_root:
2254 dput(mntroot); 2258 dput(mntroot);
2255error_splat_super: 2259error_splat_super:
2260 if (server && !s->s_root)
2261 bdi_unregister(&server->backing_dev_info);
2262error_splat_bdi:
2256 deactivate_locked_super(s); 2263 deactivate_locked_super(s);
2257 goto out; 2264 goto out;
2258} 2265}
2259 2266
2260/* 2267/*
2268 * Ensure that we unregister the bdi before kill_anon_super
2269 * releases the device name
2270 */
2271static void nfs_put_super(struct super_block *s)
2272{
2273 struct nfs_server *server = NFS_SB(s);
2274
2275 bdi_unregister(&server->backing_dev_info);
2276}
2277
2278/*
2261 * Destroy an NFS2/3 superblock 2279 * Destroy an NFS2/3 superblock
2262 */ 2280 */
2263static void nfs_kill_super(struct super_block *s) 2281static void nfs_kill_super(struct super_block *s)
@@ -2265,7 +2283,6 @@ static void nfs_kill_super(struct super_block *s)
2265 struct nfs_server *server = NFS_SB(s); 2283 struct nfs_server *server = NFS_SB(s);
2266 2284
2267 kill_anon_super(s); 2285 kill_anon_super(s);
2268 bdi_unregister(&server->backing_dev_info);
2269 nfs_fscache_release_super_cookie(s); 2286 nfs_fscache_release_super_cookie(s);
2270 nfs_free_server(server); 2287 nfs_free_server(server);
2271} 2288}
@@ -2313,7 +2330,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
2313 } else { 2330 } else {
2314 error = nfs_bdi_register(server); 2331 error = nfs_bdi_register(server);
2315 if (error) 2332 if (error)
2316 goto error_splat_super; 2333 goto error_splat_bdi;
2317 } 2334 }
2318 2335
2319 if (!s->s_root) { 2336 if (!s->s_root) {
@@ -2350,6 +2367,9 @@ out_err_noserver:
2350 return error; 2367 return error;
2351 2368
2352error_splat_super: 2369error_splat_super:
2370 if (server && !s->s_root)
2371 bdi_unregister(&server->backing_dev_info);
2372error_splat_bdi:
2353 deactivate_locked_super(s); 2373 deactivate_locked_super(s);
2354 dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error); 2374 dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error);
2355 return error; 2375 return error;
@@ -2565,7 +2585,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
2565 } else { 2585 } else {
2566 error = nfs_bdi_register(server); 2586 error = nfs_bdi_register(server);
2567 if (error) 2587 if (error)
2568 goto error_splat_super; 2588 goto error_splat_bdi;
2569 } 2589 }
2570 2590
2571 if (!s->s_root) { 2591 if (!s->s_root) {
@@ -2603,6 +2623,9 @@ out_free:
2603error_splat_root: 2623error_splat_root:
2604 dput(mntroot); 2624 dput(mntroot);
2605error_splat_super: 2625error_splat_super:
2626 if (server && !s->s_root)
2627 bdi_unregister(&server->backing_dev_info);
2628error_splat_bdi:
2606 deactivate_locked_super(s); 2629 deactivate_locked_super(s);
2607 goto out; 2630 goto out;
2608} 2631}
@@ -2798,7 +2821,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
2798 } else { 2821 } else {
2799 error = nfs_bdi_register(server); 2822 error = nfs_bdi_register(server);
2800 if (error) 2823 if (error)
2801 goto error_splat_super; 2824 goto error_splat_bdi;
2802 } 2825 }
2803 2826
2804 if (!s->s_root) { 2827 if (!s->s_root) {
@@ -2834,6 +2857,9 @@ out_err_noserver:
2834 return error; 2857 return error;
2835 2858
2836error_splat_super: 2859error_splat_super:
2860 if (server && !s->s_root)
2861 bdi_unregister(&server->backing_dev_info);
2862error_splat_bdi:
2837 deactivate_locked_super(s); 2863 deactivate_locked_super(s);
2838 dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error); 2864 dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error);
2839 return error; 2865 return error;
@@ -2880,7 +2906,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
2880 } else { 2906 } else {
2881 error = nfs_bdi_register(server); 2907 error = nfs_bdi_register(server);
2882 if (error) 2908 if (error)
2883 goto error_splat_super; 2909 goto error_splat_bdi;
2884 } 2910 }
2885 2911
2886 if (!s->s_root) { 2912 if (!s->s_root) {
@@ -2916,6 +2942,9 @@ out_err_noserver:
2916 return error; 2942 return error;
2917 2943
2918error_splat_super: 2944error_splat_super:
2945 if (server && !s->s_root)
2946 bdi_unregister(&server->backing_dev_info);
2947error_splat_bdi:
2919 deactivate_locked_super(s); 2948 deactivate_locked_super(s);
2920 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); 2949 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
2921 return error; 2950 return error;
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 412738dbfbc7..05c9e02f4153 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -19,7 +19,6 @@
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/stat.h> 20#include <linux/stat.h>
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/slab.h>
23#include <linux/string.h> 22#include <linux/string.h>
24#include <linux/namei.h> 23#include <linux/namei.h>
25 24
@@ -50,7 +49,7 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
50 struct page *page; 49 struct page *page;
51 void *err; 50 void *err;
52 51
53 err = ERR_PTR(nfs_revalidate_mapping_nolock(inode, inode->i_mapping)); 52 err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
54 if (err) 53 if (err)
55 goto read_failed; 54 goto read_failed;
56 page = read_cache_page(&inode->i_data, 0, 55 page = read_cache_page(&inode->i_data, 0,
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index 70e1fbbaaeab..ad4d2e787b20 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -15,8 +15,10 @@
15 15
16#include "callback.h" 16#include "callback.h"
17 17
18#ifdef CONFIG_NFS_V4
18static const int nfs_set_port_min = 0; 19static const int nfs_set_port_min = 0;
19static const int nfs_set_port_max = 65535; 20static const int nfs_set_port_max = 65535;
21#endif
20static struct ctl_table_header *nfs_callback_sysctl_table; 22static struct ctl_table_header *nfs_callback_sysctl_table;
21 23
22static ctl_table nfs_cb_sysctls[] = { 24static ctl_table nfs_cb_sysctls[] = {
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d171696017f4..de38d63aa920 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -201,6 +201,7 @@ static int nfs_set_page_writeback(struct page *page)
201 struct inode *inode = page->mapping->host; 201 struct inode *inode = page->mapping->host;
202 struct nfs_server *nfss = NFS_SERVER(inode); 202 struct nfs_server *nfss = NFS_SERVER(inode);
203 203
204 page_cache_get(page);
204 if (atomic_long_inc_return(&nfss->writeback) > 205 if (atomic_long_inc_return(&nfss->writeback) >
205 NFS_CONGESTION_ON_THRESH) { 206 NFS_CONGESTION_ON_THRESH) {
206 set_bdi_congested(&nfss->backing_dev_info, 207 set_bdi_congested(&nfss->backing_dev_info,
@@ -216,6 +217,7 @@ static void nfs_end_page_writeback(struct page *page)
216 struct nfs_server *nfss = NFS_SERVER(inode); 217 struct nfs_server *nfss = NFS_SERVER(inode);
217 218
218 end_page_writeback(page); 219 end_page_writeback(page);
220 page_cache_release(page);
219 if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) 221 if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
220 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); 222 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
221} 223}
@@ -421,6 +423,7 @@ static void
421nfs_mark_request_dirty(struct nfs_page *req) 423nfs_mark_request_dirty(struct nfs_page *req)
422{ 424{
423 __set_page_dirty_nobuffers(req->wb_page); 425 __set_page_dirty_nobuffers(req->wb_page);
426 __mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC);
424} 427}
425 428
426#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 429#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -438,6 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req)
438 radix_tree_tag_set(&nfsi->nfs_page_tree, 441 radix_tree_tag_set(&nfsi->nfs_page_tree,
439 req->wb_index, 442 req->wb_index,
440 NFS_PAGE_TAG_COMMIT); 443 NFS_PAGE_TAG_COMMIT);
444 nfsi->ncommit++;
441 spin_unlock(&inode->i_lock); 445 spin_unlock(&inode->i_lock);
442 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 446 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
443 inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); 447 inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
@@ -501,57 +505,6 @@ int nfs_reschedule_unstable_write(struct nfs_page *req)
501} 505}
502#endif 506#endif
503 507
504/*
505 * Wait for a request to complete.
506 *
507 * Interruptible by fatal signals only.
508 */
509static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, unsigned int npages)
510{
511 struct nfs_inode *nfsi = NFS_I(inode);
512 struct nfs_page *req;
513 pgoff_t idx_end, next;
514 unsigned int res = 0;
515 int error;
516
517 if (npages == 0)
518 idx_end = ~0;
519 else
520 idx_end = idx_start + npages - 1;
521
522 next = idx_start;
523 while (radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&req, next, 1, NFS_PAGE_TAG_LOCKED)) {
524 if (req->wb_index > idx_end)
525 break;
526
527 next = req->wb_index + 1;
528 BUG_ON(!NFS_WBACK_BUSY(req));
529
530 kref_get(&req->wb_kref);
531 spin_unlock(&inode->i_lock);
532 error = nfs_wait_on_request(req);
533 nfs_release_request(req);
534 spin_lock(&inode->i_lock);
535 if (error < 0)
536 return error;
537 res++;
538 }
539 return res;
540}
541
542static void nfs_cancel_commit_list(struct list_head *head)
543{
544 struct nfs_page *req;
545
546 while(!list_empty(head)) {
547 req = nfs_list_entry(head->next);
548 nfs_list_remove_request(req);
549 nfs_clear_request_commit(req);
550 nfs_inode_remove_request(req);
551 nfs_unlock_request(req);
552 }
553}
554
555#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 508#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
556static int 509static int
557nfs_need_commit(struct nfs_inode *nfsi) 510nfs_need_commit(struct nfs_inode *nfsi)
@@ -573,11 +526,17 @@ static int
573nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) 526nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
574{ 527{
575 struct nfs_inode *nfsi = NFS_I(inode); 528 struct nfs_inode *nfsi = NFS_I(inode);
529 int ret;
576 530
577 if (!nfs_need_commit(nfsi)) 531 if (!nfs_need_commit(nfsi))
578 return 0; 532 return 0;
579 533
580 return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); 534 ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
535 if (ret > 0)
536 nfsi->ncommit -= ret;
537 if (nfs_need_commit(NFS_I(inode)))
538 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
539 return ret;
581} 540}
582#else 541#else
583static inline int nfs_need_commit(struct nfs_inode *nfsi) 542static inline int nfs_need_commit(struct nfs_inode *nfsi)
@@ -642,9 +601,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
642 spin_lock(&inode->i_lock); 601 spin_lock(&inode->i_lock);
643 } 602 }
644 603
645 if (nfs_clear_request_commit(req)) 604 if (nfs_clear_request_commit(req) &&
646 radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, 605 radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
647 req->wb_index, NFS_PAGE_TAG_COMMIT); 606 req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL)
607 NFS_I(inode)->ncommit--;
648 608
649 /* Okay, the request matches. Update the region */ 609 /* Okay, the request matches. Update the region */
650 if (offset < req->wb_offset) { 610 if (offset < req->wb_offset) {
@@ -703,9 +663,11 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
703 req = nfs_setup_write_request(ctx, page, offset, count); 663 req = nfs_setup_write_request(ctx, page, offset, count);
704 if (IS_ERR(req)) 664 if (IS_ERR(req))
705 return PTR_ERR(req); 665 return PTR_ERR(req);
666 nfs_mark_request_dirty(req);
706 /* Update file length */ 667 /* Update file length */
707 nfs_grow_file(page, offset, count); 668 nfs_grow_file(page, offset, count);
708 nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); 669 nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
670 nfs_mark_request_dirty(req);
709 nfs_clear_page_tag_locked(req); 671 nfs_clear_page_tag_locked(req);
710 return 0; 672 return 0;
711} 673}
@@ -782,8 +744,6 @@ int nfs_updatepage(struct file *file, struct page *page,
782 status = nfs_writepage_setup(ctx, page, offset, count); 744 status = nfs_writepage_setup(ctx, page, offset, count);
783 if (status < 0) 745 if (status < 0)
784 nfs_set_pageerror(page); 746 nfs_set_pageerror(page);
785 else
786 __set_page_dirty_nobuffers(page);
787 747
788 dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", 748 dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
789 status, (long long)i_size_read(inode)); 749 status, (long long)i_size_read(inode));
@@ -792,13 +752,12 @@ int nfs_updatepage(struct file *file, struct page *page,
792 752
793static void nfs_writepage_release(struct nfs_page *req) 753static void nfs_writepage_release(struct nfs_page *req)
794{ 754{
755 struct page *page = req->wb_page;
795 756
796 if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) { 757 if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req))
797 nfs_end_page_writeback(req->wb_page);
798 nfs_inode_remove_request(req); 758 nfs_inode_remove_request(req);
799 } else
800 nfs_end_page_writeback(req->wb_page);
801 nfs_clear_page_tag_locked(req); 759 nfs_clear_page_tag_locked(req);
760 nfs_end_page_writeback(page);
802} 761}
803 762
804static int flush_task_priority(int how) 763static int flush_task_priority(int how)
@@ -822,7 +781,6 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
822 int how) 781 int how)
823{ 782{
824 struct inode *inode = req->wb_context->path.dentry->d_inode; 783 struct inode *inode = req->wb_context->path.dentry->d_inode;
825 int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
826 int priority = flush_task_priority(how); 784 int priority = flush_task_priority(how);
827 struct rpc_task *task; 785 struct rpc_task *task;
828 struct rpc_message msg = { 786 struct rpc_message msg = {
@@ -837,9 +795,10 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
837 .callback_ops = call_ops, 795 .callback_ops = call_ops,
838 .callback_data = data, 796 .callback_data = data,
839 .workqueue = nfsiod_workqueue, 797 .workqueue = nfsiod_workqueue,
840 .flags = flags, 798 .flags = RPC_TASK_ASYNC,
841 .priority = priority, 799 .priority = priority,
842 }; 800 };
801 int ret = 0;
843 802
844 /* Set up the RPC argument and reply structs 803 /* Set up the RPC argument and reply structs
845 * NB: take care not to mess about with data->commit et al. */ 804 * NB: take care not to mess about with data->commit et al. */
@@ -878,10 +837,18 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
878 (unsigned long long)data->args.offset); 837 (unsigned long long)data->args.offset);
879 838
880 task = rpc_run_task(&task_setup_data); 839 task = rpc_run_task(&task_setup_data);
881 if (IS_ERR(task)) 840 if (IS_ERR(task)) {
882 return PTR_ERR(task); 841 ret = PTR_ERR(task);
842 goto out;
843 }
844 if (how & FLUSH_SYNC) {
845 ret = rpc_wait_for_completion_task(task);
846 if (ret == 0)
847 ret = task->tk_status;
848 }
883 rpc_put_task(task); 849 rpc_put_task(task);
884 return 0; 850out:
851 return ret;
885} 852}
886 853
887/* If a nfs_flush_* function fails, it should remove reqs from @head and 854/* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -890,9 +857,11 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
890 */ 857 */
891static void nfs_redirty_request(struct nfs_page *req) 858static void nfs_redirty_request(struct nfs_page *req)
892{ 859{
860 struct page *page = req->wb_page;
861
893 nfs_mark_request_dirty(req); 862 nfs_mark_request_dirty(req);
894 nfs_end_page_writeback(req->wb_page);
895 nfs_clear_page_tag_locked(req); 863 nfs_clear_page_tag_locked(req);
864 nfs_end_page_writeback(page);
896} 865}
897 866
898/* 867/*
@@ -1127,16 +1096,15 @@ static void nfs_writeback_release_full(void *calldata)
1127 if (nfs_write_need_commit(data)) { 1096 if (nfs_write_need_commit(data)) {
1128 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); 1097 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
1129 nfs_mark_request_commit(req); 1098 nfs_mark_request_commit(req);
1130 nfs_end_page_writeback(page);
1131 dprintk(" marked for commit\n"); 1099 dprintk(" marked for commit\n");
1132 goto next; 1100 goto next;
1133 } 1101 }
1134 dprintk(" OK\n"); 1102 dprintk(" OK\n");
1135remove_request: 1103remove_request:
1136 nfs_end_page_writeback(page);
1137 nfs_inode_remove_request(req); 1104 nfs_inode_remove_request(req);
1138 next: 1105 next:
1139 nfs_clear_page_tag_locked(req); 1106 nfs_clear_page_tag_locked(req);
1107 nfs_end_page_writeback(page);
1140 } 1108 }
1141 nfs_writedata_release(calldata); 1109 nfs_writedata_release(calldata);
1142} 1110}
@@ -1233,7 +1201,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1233 1201
1234 1202
1235#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 1203#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
1236void nfs_commitdata_release(void *data) 1204static void nfs_commitdata_release(void *data)
1237{ 1205{
1238 struct nfs_write_data *wdata = data; 1206 struct nfs_write_data *wdata = data;
1239 1207
@@ -1250,7 +1218,6 @@ static int nfs_commit_rpcsetup(struct list_head *head,
1250{ 1218{
1251 struct nfs_page *first = nfs_list_entry(head->next); 1219 struct nfs_page *first = nfs_list_entry(head->next);
1252 struct inode *inode = first->wb_context->path.dentry->d_inode; 1220 struct inode *inode = first->wb_context->path.dentry->d_inode;
1253 int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
1254 int priority = flush_task_priority(how); 1221 int priority = flush_task_priority(how);
1255 struct rpc_task *task; 1222 struct rpc_task *task;
1256 struct rpc_message msg = { 1223 struct rpc_message msg = {
@@ -1265,7 +1232,7 @@ static int nfs_commit_rpcsetup(struct list_head *head,
1265 .callback_ops = &nfs_commit_ops, 1232 .callback_ops = &nfs_commit_ops,
1266 .callback_data = data, 1233 .callback_data = data,
1267 .workqueue = nfsiod_workqueue, 1234 .workqueue = nfsiod_workqueue,
1268 .flags = flags, 1235 .flags = RPC_TASK_ASYNC,
1269 .priority = priority, 1236 .priority = priority,
1270 }; 1237 };
1271 1238
@@ -1295,6 +1262,8 @@ static int nfs_commit_rpcsetup(struct list_head *head,
1295 task = rpc_run_task(&task_setup_data); 1262 task = rpc_run_task(&task_setup_data);
1296 if (IS_ERR(task)) 1263 if (IS_ERR(task))
1297 return PTR_ERR(task); 1264 return PTR_ERR(task);
1265 if (how & FLUSH_SYNC)
1266 rpc_wait_for_completion_task(task);
1298 rpc_put_task(task); 1267 rpc_put_task(task);
1299 return 0; 1268 return 0;
1300} 1269}
@@ -1391,7 +1360,7 @@ static const struct rpc_call_ops nfs_commit_ops = {
1391 .rpc_release = nfs_commit_release, 1360 .rpc_release = nfs_commit_release,
1392}; 1361};
1393 1362
1394int nfs_commit_inode(struct inode *inode, int how) 1363static int nfs_commit_inode(struct inode *inode, int how)
1395{ 1364{
1396 LIST_HEAD(head); 1365 LIST_HEAD(head);
1397 int res; 1366 int res;
@@ -1406,92 +1375,51 @@ int nfs_commit_inode(struct inode *inode, int how)
1406 } 1375 }
1407 return res; 1376 return res;
1408} 1377}
1409#else
1410static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
1411{
1412 return 0;
1413}
1414#endif
1415 1378
1416long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) 1379static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
1417{ 1380{
1418 struct inode *inode = mapping->host; 1381 struct nfs_inode *nfsi = NFS_I(inode);
1419 pgoff_t idx_start, idx_end; 1382 int flags = FLUSH_SYNC;
1420 unsigned int npages = 0; 1383 int ret = 0;
1421 LIST_HEAD(head); 1384
1422 int nocommit = how & FLUSH_NOCOMMIT; 1385 /* Don't commit yet if this is a non-blocking flush and there are
1423 long pages, ret; 1386 * lots of outstanding writes for this mapping.
1424 1387 */
1425 /* FIXME */ 1388 if (wbc->sync_mode == WB_SYNC_NONE &&
1426 if (wbc->range_cyclic) 1389 nfsi->ncommit <= (nfsi->npages >> 1))
1427 idx_start = 0; 1390 goto out_mark_dirty;
1428 else { 1391
1429 idx_start = wbc->range_start >> PAGE_CACHE_SHIFT; 1392 if (wbc->nonblocking || wbc->for_background)
1430 idx_end = wbc->range_end >> PAGE_CACHE_SHIFT; 1393 flags = 0;
1431 if (idx_end > idx_start) { 1394 ret = nfs_commit_inode(inode, flags);
1432 pgoff_t l_npages = 1 + idx_end - idx_start; 1395 if (ret >= 0) {
1433 npages = l_npages; 1396 if (wbc->sync_mode == WB_SYNC_NONE) {
1434 if (sizeof(npages) != sizeof(l_npages) && 1397 if (ret < wbc->nr_to_write)
1435 (pgoff_t)npages != l_npages) 1398 wbc->nr_to_write -= ret;
1436 npages = 0; 1399 else
1400 wbc->nr_to_write = 0;
1437 } 1401 }
1402 return 0;
1438 } 1403 }
1439 how &= ~FLUSH_NOCOMMIT; 1404out_mark_dirty:
1440 spin_lock(&inode->i_lock); 1405 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1441 do {
1442 ret = nfs_wait_on_requests_locked(inode, idx_start, npages);
1443 if (ret != 0)
1444 continue;
1445 if (nocommit)
1446 break;
1447 pages = nfs_scan_commit(inode, &head, idx_start, npages);
1448 if (pages == 0)
1449 break;
1450 if (how & FLUSH_INVALIDATE) {
1451 spin_unlock(&inode->i_lock);
1452 nfs_cancel_commit_list(&head);
1453 ret = pages;
1454 spin_lock(&inode->i_lock);
1455 continue;
1456 }
1457 pages += nfs_scan_commit(inode, &head, 0, 0);
1458 spin_unlock(&inode->i_lock);
1459 ret = nfs_commit_list(inode, &head, how);
1460 spin_lock(&inode->i_lock);
1461
1462 } while (ret >= 0);
1463 spin_unlock(&inode->i_lock);
1464 return ret; 1406 return ret;
1465} 1407}
1466 1408#else
1467static int __nfs_write_mapping(struct address_space *mapping, struct writeback_control *wbc, int how) 1409static int nfs_commit_inode(struct inode *inode, int how)
1468{ 1410{
1469 int ret;
1470
1471 ret = nfs_writepages(mapping, wbc);
1472 if (ret < 0)
1473 goto out;
1474 ret = nfs_sync_mapping_wait(mapping, wbc, how);
1475 if (ret < 0)
1476 goto out;
1477 return 0; 1411 return 0;
1478out:
1479 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1480 return ret;
1481} 1412}
1482 1413
1483/* Two pass sync: first using WB_SYNC_NONE, then WB_SYNC_ALL */ 1414static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
1484static int nfs_write_mapping(struct address_space *mapping, int how)
1485{ 1415{
1486 struct writeback_control wbc = { 1416 return 0;
1487 .bdi = mapping->backing_dev_info, 1417}
1488 .sync_mode = WB_SYNC_ALL, 1418#endif
1489 .nr_to_write = LONG_MAX,
1490 .range_start = 0,
1491 .range_end = LLONG_MAX,
1492 };
1493 1419
1494 return __nfs_write_mapping(mapping, &wbc, how); 1420int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1421{
1422 return nfs_commit_unstable_pages(inode, wbc);
1495} 1423}
1496 1424
1497/* 1425/*
@@ -1499,37 +1427,26 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
1499 */ 1427 */
1500int nfs_wb_all(struct inode *inode) 1428int nfs_wb_all(struct inode *inode)
1501{ 1429{
1502 return nfs_write_mapping(inode->i_mapping, 0); 1430 struct writeback_control wbc = {
1503} 1431 .sync_mode = WB_SYNC_ALL,
1432 .nr_to_write = LONG_MAX,
1433 .range_start = 0,
1434 .range_end = LLONG_MAX,
1435 };
1504 1436
1505int nfs_wb_nocommit(struct inode *inode) 1437 return sync_inode(inode, &wbc);
1506{
1507 return nfs_write_mapping(inode->i_mapping, FLUSH_NOCOMMIT);
1508} 1438}
1509 1439
1510int nfs_wb_page_cancel(struct inode *inode, struct page *page) 1440int nfs_wb_page_cancel(struct inode *inode, struct page *page)
1511{ 1441{
1512 struct nfs_page *req; 1442 struct nfs_page *req;
1513 loff_t range_start = page_offset(page);
1514 loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
1515 struct writeback_control wbc = {
1516 .bdi = page->mapping->backing_dev_info,
1517 .sync_mode = WB_SYNC_ALL,
1518 .nr_to_write = LONG_MAX,
1519 .range_start = range_start,
1520 .range_end = range_end,
1521 };
1522 int ret = 0; 1443 int ret = 0;
1523 1444
1524 BUG_ON(!PageLocked(page)); 1445 BUG_ON(!PageLocked(page));
1525 for (;;) { 1446 for (;;) {
1526 req = nfs_page_find_request(page); 1447 req = nfs_page_find_request(page);
1527 if (req == NULL) 1448 if (req == NULL)
1528 goto out;
1529 if (test_bit(PG_CLEAN, &req->wb_flags)) {
1530 nfs_release_request(req);
1531 break; 1449 break;
1532 }
1533 if (nfs_lock_request_dontget(req)) { 1450 if (nfs_lock_request_dontget(req)) {
1534 nfs_inode_remove_request(req); 1451 nfs_inode_remove_request(req);
1535 /* 1452 /*
@@ -1541,55 +1458,56 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
1541 break; 1458 break;
1542 } 1459 }
1543 ret = nfs_wait_on_request(req); 1460 ret = nfs_wait_on_request(req);
1461 nfs_release_request(req);
1544 if (ret < 0) 1462 if (ret < 0)
1545 goto out; 1463 break;
1546 } 1464 }
1547 if (!PagePrivate(page))
1548 return 0;
1549 ret = nfs_sync_mapping_wait(page->mapping, &wbc, FLUSH_INVALIDATE);
1550out:
1551 return ret; 1465 return ret;
1552} 1466}
1553 1467
1554static int nfs_wb_page_priority(struct inode *inode, struct page *page, 1468/*
1555 int how) 1469 * Write back all requests on one page - we do this before reading it.
1470 */
1471int nfs_wb_page(struct inode *inode, struct page *page)
1556{ 1472{
1557 loff_t range_start = page_offset(page); 1473 loff_t range_start = page_offset(page);
1558 loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); 1474 loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
1559 struct writeback_control wbc = { 1475 struct writeback_control wbc = {
1560 .bdi = page->mapping->backing_dev_info,
1561 .sync_mode = WB_SYNC_ALL, 1476 .sync_mode = WB_SYNC_ALL,
1562 .nr_to_write = LONG_MAX, 1477 .nr_to_write = 0,
1563 .range_start = range_start, 1478 .range_start = range_start,
1564 .range_end = range_end, 1479 .range_end = range_end,
1565 }; 1480 };
1481 struct nfs_page *req;
1482 int need_commit;
1566 int ret; 1483 int ret;
1567 1484
1568 do { 1485 while(PagePrivate(page)) {
1569 if (clear_page_dirty_for_io(page)) { 1486 if (clear_page_dirty_for_io(page)) {
1570 ret = nfs_writepage_locked(page, &wbc); 1487 ret = nfs_writepage_locked(page, &wbc);
1571 if (ret < 0) 1488 if (ret < 0)
1572 goto out_error; 1489 goto out_error;
1573 } else if (!PagePrivate(page)) 1490 }
1491 req = nfs_find_and_lock_request(page);
1492 if (!req)
1574 break; 1493 break;
1575 ret = nfs_sync_mapping_wait(page->mapping, &wbc, how); 1494 if (IS_ERR(req)) {
1576 if (ret < 0) 1495 ret = PTR_ERR(req);
1577 goto out_error; 1496 goto out_error;
1578 } while (PagePrivate(page)); 1497 }
1498 need_commit = test_bit(PG_CLEAN, &req->wb_flags);
1499 nfs_clear_page_tag_locked(req);
1500 if (need_commit) {
1501 ret = nfs_commit_inode(inode, FLUSH_SYNC);
1502 if (ret < 0)
1503 goto out_error;
1504 }
1505 }
1579 return 0; 1506 return 0;
1580out_error: 1507out_error:
1581 __mark_inode_dirty(inode, I_DIRTY_PAGES);
1582 return ret; 1508 return ret;
1583} 1509}
1584 1510
1585/*
1586 * Write back all requests on one page - we do this before reading it.
1587 */
1588int nfs_wb_page(struct inode *inode, struct page* page)
1589{
1590 return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
1591}
1592
1593#ifdef CONFIG_MIGRATION 1511#ifdef CONFIG_MIGRATION
1594int nfs_migrate_page(struct address_space *mapping, struct page *newpage, 1512int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1595 struct page *page) 1513 struct page *page)
@@ -1597,8 +1515,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1597 struct nfs_page *req; 1515 struct nfs_page *req;
1598 int ret; 1516 int ret;
1599 1517
1600 if (PageFsCache(page)) 1518 nfs_fscache_release_page(page, GFP_KERNEL);
1601 nfs_fscache_release_page(page, GFP_KERNEL);
1602 1519
1603 req = nfs_find_and_lock_request(page); 1520 req = nfs_find_and_lock_request(page);
1604 ret = PTR_ERR(req); 1521 ret = PTR_ERR(req);
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 04133aacb1e5..fc1c52571c03 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/fs.h> 24#include <linux/fs.h>
25#include <linux/gfp.h>
25#include <linux/sunrpc/xdr.h> 26#include <linux/sunrpc/xdr.h>
26#include <linux/nfsacl.h> 27#include <linux/nfsacl.h>
27#include <linux/nfs3.h> 28#include <linux/nfs3.h>
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index d3854d94b7cf..bf9cbd242ddd 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -36,10 +36,9 @@ static struct file *do_open(char *name, int flags)
36 return ERR_PTR(error); 36 return ERR_PTR(error);
37 37
38 if (flags == O_RDWR) 38 if (flags == O_RDWR)
39 error = may_open(&nd.path, MAY_READ|MAY_WRITE, 39 error = may_open(&nd.path, MAY_READ|MAY_WRITE, flags);
40 FMODE_READ|FMODE_WRITE);
41 else 40 else
42 error = may_open(&nd.path, MAY_WRITE, FMODE_WRITE); 41 error = may_open(&nd.path, MAY_WRITE, flags);
43 42
44 if (!error) 43 if (!error)
45 return dentry_open(nd.path.dentry, nd.path.mnt, flags, 44 return dentry_open(nd.path.dentry, nd.path.mnt, flags,
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c487810a2366..872a5ef550c7 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -12,6 +12,7 @@
12 * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de> 12 * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de>
13 */ 13 */
14 14
15#include <linux/slab.h>
15#include <linux/namei.h> 16#include <linux/namei.h>
16#include <linux/module.h> 17#include <linux/module.h>
17#include <linux/exportfs.h> 18#include <linux/exportfs.h>
@@ -1316,19 +1317,11 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct path *path)
1316 1317
1317static struct svc_export *find_fsidzero_export(struct svc_rqst *rqstp) 1318static struct svc_export *find_fsidzero_export(struct svc_rqst *rqstp)
1318{ 1319{
1319 struct svc_export *exp;
1320 u32 fsidv[2]; 1320 u32 fsidv[2];
1321 1321
1322 mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL); 1322 mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL);
1323 1323
1324 exp = rqst_exp_find(rqstp, FSID_NUM, fsidv); 1324 return rqst_exp_find(rqstp, FSID_NUM, fsidv);
1325 /*
1326 * We shouldn't have accepting an nfsv4 request at all if we
1327 * don't have a pseudoexport!:
1328 */
1329 if (IS_ERR(exp) && PTR_ERR(exp) == -ENOENT)
1330 exp = ERR_PTR(-ESERVERFAULT);
1331 return exp;
1332} 1325}
1333 1326
1334/* 1327/*
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index f20589d2ae27..6aa5590c3679 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -7,6 +7,7 @@
7#include "nfsd.h" 7#include "nfsd.h"
8/* FIXME: nfsacl.h is a broken header */ 8/* FIXME: nfsacl.h is a broken header */
9#include <linux/nfsacl.h> 9#include <linux/nfsacl.h>
10#include <linux/gfp.h>
10#include "cache.h" 11#include "cache.h"
11#include "xdr3.h" 12#include "xdr3.h"
12#include "vfs.h" 13#include "vfs.h"
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index e0c4846bad92..a596e9d987e4 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -7,6 +7,7 @@
7#include "nfsd.h" 7#include "nfsd.h"
8/* FIXME: nfsacl.h is a broken header */ 8/* FIXME: nfsacl.h is a broken header */
9#include <linux/nfsacl.h> 9#include <linux/nfsacl.h>
10#include <linux/gfp.h>
10#include "cache.h" 11#include "cache.h"
11#include "xdr3.h" 12#include "xdr3.h"
12#include "vfs.h" 13#include "vfs.h"
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 88150685df34..e48052615159 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -34,6 +34,7 @@
34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 */ 35 */
36 36
37#include <linux/slab.h>
37#include <linux/nfs_fs.h> 38#include <linux/nfs_fs.h>
38#include <linux/nfs4_acl.h> 39#include <linux/nfs4_acl.h>
39 40
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c6eed2a3b093..7e32bd394e86 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -32,6 +32,7 @@
32 */ 32 */
33 33
34#include <linux/sunrpc/clnt.h> 34#include <linux/sunrpc/clnt.h>
35#include <linux/slab.h>
35#include "nfsd.h" 36#include "nfsd.h"
36#include "state.h" 37#include "state.h"
37 38
@@ -525,6 +526,8 @@ static struct rpc_cred *callback_cred;
525 526
526int set_callback_cred(void) 527int set_callback_cred(void)
527{ 528{
529 if (callback_cred)
530 return 0;
528 callback_cred = rpc_lookup_machine_cred(); 531 callback_cred = rpc_lookup_machine_cred();
529 if (!callback_cred) 532 if (!callback_cred)
530 return -ENOMEM; 533 return -ENOMEM;
@@ -542,7 +545,8 @@ void do_probe_callback(struct nfs4_client *clp)
542 }; 545 };
543 int status; 546 int status;
544 547
545 status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_SOFT, 548 status = rpc_call_async(cb->cb_client, &msg,
549 RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
546 &nfsd4_cb_probe_ops, (void *)clp); 550 &nfsd4_cb_probe_ops, (void *)clp);
547 if (status) { 551 if (status) {
548 warn_no_callback_path(clp, status); 552 warn_no_callback_path(clp, status);
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 6e2983b27f3c..c78dbf493424 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -36,6 +36,7 @@
36#include <linux/nfsd_idmap.h> 36#include <linux/nfsd_idmap.h>
37#include <linux/seq_file.h> 37#include <linux/seq_file.h>
38#include <linux/sched.h> 38#include <linux/sched.h>
39#include <linux/slab.h>
39 40
40/* 41/*
41 * Cache entry 42 * Cache entry
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 37514c469846..2ab9e8501bfe 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -33,6 +33,7 @@
33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 */ 34 */
35#include <linux/file.h> 35#include <linux/file.h>
36#include <linux/slab.h>
36 37
37#include "cache.h" 38#include "cache.h"
38#include "xdr4.h" 39#include "xdr4.h"
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 5a754f7b71ed..7a9ae3254a4b 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -32,6 +32,7 @@
32*/ 32*/
33 33
34#include <linux/file.h> 34#include <linux/file.h>
35#include <linux/slab.h>
35#include <linux/namei.h> 36#include <linux/namei.h>
36#include <linux/crypto.h> 37#include <linux/crypto.h>
37#include <linux/sched.h> 38#include <linux/sched.h>
@@ -119,9 +120,7 @@ out_no_tfm:
119static void 120static void
120nfsd4_sync_rec_dir(void) 121nfsd4_sync_rec_dir(void)
121{ 122{
122 mutex_lock(&rec_dir.dentry->d_inode->i_mutex); 123 vfs_fsync(NULL, rec_dir.dentry, 0);
123 nfsd_sync_dir(rec_dir.dentry);
124 mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
125} 124}
126 125
127int 126int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f19ed866c95f..6a8fedaa4f55 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -34,6 +34,7 @@
34 34
35#include <linux/file.h> 35#include <linux/file.h>
36#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
37#include <linux/slab.h>
37#include <linux/namei.h> 38#include <linux/namei.h>
38#include <linux/swap.h> 39#include <linux/swap.h>
39#include <linux/sunrpc/svcauth_gss.h> 40#include <linux/sunrpc/svcauth_gss.h>
@@ -1998,7 +1999,9 @@ nfs4_file_downgrade(struct file *filp, unsigned int share_access)
1998{ 1999{
1999 if (share_access & NFS4_SHARE_ACCESS_WRITE) { 2000 if (share_access & NFS4_SHARE_ACCESS_WRITE) {
2000 drop_file_write_access(filp); 2001 drop_file_write_access(filp);
2002 spin_lock(&filp->f_lock);
2001 filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE; 2003 filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE;
2004 spin_unlock(&filp->f_lock);
2002 } 2005 }
2003} 2006}
2004 2007
@@ -2480,8 +2483,10 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
2480 } 2483 }
2481 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); 2484 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
2482 2485
2483 if (nfsd4_has_session(&resp->cstate)) 2486 if (nfsd4_has_session(&resp->cstate)) {
2484 open->op_stateowner->so_confirmed = 1; 2487 open->op_stateowner->so_confirmed = 1;
2488 nfsd4_create_clid_dir(open->op_stateowner->so_client);
2489 }
2485 2490
2486 /* 2491 /*
2487 * Attempt to hand out a delegation. No error return, because the 2492 * Attempt to hand out a delegation. No error return, because the
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index a8587e90fd5a..34ccf815ea8a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -40,6 +40,7 @@
40 * at the end of nfs4svc_decode_compoundargs. 40 * at the end of nfs4svc_decode_compoundargs.
41 */ 41 */
42 42
43#include <linux/slab.h>
43#include <linux/namei.h> 44#include <linux/namei.h>
44#include <linux/statfs.h> 45#include <linux/statfs.h>
45#include <linux/utsname.h> 46#include <linux/utsname.h>
@@ -160,10 +161,10 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
160 argp->p = page_address(argp->pagelist[0]); 161 argp->p = page_address(argp->pagelist[0]);
161 argp->pagelist++; 162 argp->pagelist++;
162 if (argp->pagelen < PAGE_SIZE) { 163 if (argp->pagelen < PAGE_SIZE) {
163 argp->end = p + (argp->pagelen>>2); 164 argp->end = argp->p + (argp->pagelen>>2);
164 argp->pagelen = 0; 165 argp->pagelen = 0;
165 } else { 166 } else {
166 argp->end = p + (PAGE_SIZE>>2); 167 argp->end = argp->p + (PAGE_SIZE>>2);
167 argp->pagelen -= PAGE_SIZE; 168 argp->pagelen -= PAGE_SIZE;
168 } 169 }
169 memcpy(((char*)p)+avail, argp->p, (nbytes - avail)); 170 memcpy(((char*)p)+avail, argp->p, (nbytes - avail));
@@ -1425,16 +1426,16 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1425 argp->p = page_address(argp->pagelist[0]); 1426 argp->p = page_address(argp->pagelist[0]);
1426 argp->pagelist++; 1427 argp->pagelist++;
1427 if (argp->pagelen < PAGE_SIZE) { 1428 if (argp->pagelen < PAGE_SIZE) {
1428 argp->end = p + (argp->pagelen>>2); 1429 argp->end = argp->p + (argp->pagelen>>2);
1429 argp->pagelen = 0; 1430 argp->pagelen = 0;
1430 } else { 1431 } else {
1431 argp->end = p + (PAGE_SIZE>>2); 1432 argp->end = argp->p + (PAGE_SIZE>>2);
1432 argp->pagelen -= PAGE_SIZE; 1433 argp->pagelen -= PAGE_SIZE;
1433 } 1434 }
1434 } 1435 }
1435 op->opnum = ntohl(*argp->p++); 1436 op->opnum = ntohl(*argp->p++);
1436 1437
1437 if (op->opnum >= OP_ACCESS && op->opnum < ops->nops) 1438 if (op->opnum >= FIRST_NFS4_OP && op->opnum <= LAST_NFS4_OP)
1438 op->status = ops->decoders[op->opnum](argp, &op->u); 1439 op->status = ops->decoders[op->opnum](argp, &op->u);
1439 else { 1440 else {
1440 op->opnum = OP_ILLEGAL; 1441 op->opnum = OP_ILLEGAL;
@@ -1528,7 +1529,7 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
1528 } } while (0); 1529 } } while (0);
1529 1530
1530/* Encode as an array of strings the string given with components 1531/* Encode as an array of strings the string given with components
1531 * seperated @sep. 1532 * separated @sep.
1532 */ 1533 */
1533static __be32 nfsd4_encode_components(char sep, char *components, 1534static __be32 nfsd4_encode_components(char sep, char *components,
1534 __be32 **pp, int *buflen) 1535 __be32 **pp, int *buflen)
@@ -2121,9 +2122,15 @@ out_acl:
2121 * and this is the root of a cross-mounted filesystem. 2122 * and this is the root of a cross-mounted filesystem.
2122 */ 2123 */
2123 if (ignore_crossmnt == 0 && 2124 if (ignore_crossmnt == 0 &&
2124 exp->ex_path.mnt->mnt_root->d_inode == dentry->d_inode) { 2125 dentry == exp->ex_path.mnt->mnt_root) {
2125 err = vfs_getattr(exp->ex_path.mnt->mnt_parent, 2126 struct path path = exp->ex_path;
2126 exp->ex_path.mnt->mnt_mountpoint, &stat); 2127 path_get(&path);
2128 while (follow_up(&path)) {
2129 if (path.dentry != path.mnt->mnt_root)
2130 break;
2131 }
2132 err = vfs_getattr(path.mnt, path.dentry, &stat);
2133 path_put(&path);
2127 if (err) 2134 if (err)
2128 goto out_nfserr; 2135 goto out_nfserr;
2129 } 2136 }
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index da08560c4818..4666a209678a 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -8,6 +8,8 @@
8 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> 8 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
9 */ 9 */
10 10
11#include <linux/slab.h>
12
11#include "nfsd.h" 13#include "nfsd.h"
12#include "cache.h" 14#include "cache.h"
13 15
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 2604c3e70ea5..e3591073098f 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -4,6 +4,7 @@
4 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> 4 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
5 */ 5 */
6 6
7#include <linux/slab.h>
7#include <linux/namei.h> 8#include <linux/namei.h>
8#include <linux/ctype.h> 9#include <linux/ctype.h>
9 10
@@ -988,6 +989,7 @@ static ssize_t __write_ports_delfd(char *buf)
988static ssize_t __write_ports_addxprt(char *buf) 989static ssize_t __write_ports_addxprt(char *buf)
989{ 990{
990 char transport[16]; 991 char transport[16];
992 struct svc_xprt *xprt;
991 int port, err; 993 int port, err;
992 994
993 if (sscanf(buf, "%15s %4u", transport, &port) != 2) 995 if (sscanf(buf, "%15s %4u", transport, &port) != 2)
@@ -1002,13 +1004,24 @@ static ssize_t __write_ports_addxprt(char *buf)
1002 1004
1003 err = svc_create_xprt(nfsd_serv, transport, 1005 err = svc_create_xprt(nfsd_serv, transport,
1004 PF_INET, port, SVC_SOCK_ANONYMOUS); 1006 PF_INET, port, SVC_SOCK_ANONYMOUS);
1005 if (err < 0) { 1007 if (err < 0)
1006 /* Give a reasonable perror msg for bad transport string */ 1008 goto out_err;
1007 if (err == -ENOENT) 1009
1008 err = -EPROTONOSUPPORT; 1010 err = svc_create_xprt(nfsd_serv, transport,
1009 return err; 1011 PF_INET6, port, SVC_SOCK_ANONYMOUS);
1010 } 1012 if (err < 0 && err != -EAFNOSUPPORT)
1013 goto out_close;
1011 return 0; 1014 return 0;
1015out_close:
1016 xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port);
1017 if (xprt != NULL) {
1018 svc_close_xprt(xprt);
1019 svc_xprt_put(xprt);
1020 }
1021out_err:
1022 /* Decrease the count, but don't shut down the service */
1023 nfsd_serv->sv_nrthreads--;
1024 return err;
1012} 1025}
1013 1026
1014/* 1027/*
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 7c2e337d05af..6dd5f1970e01 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -20,13 +20,15 @@
20#include <linux/fcntl.h> 20#include <linux/fcntl.h>
21#include <linux/namei.h> 21#include <linux/namei.h>
22#include <linux/delay.h> 22#include <linux/delay.h>
23#include <linux/quotaops.h>
24#include <linux/fsnotify.h> 23#include <linux/fsnotify.h>
25#include <linux/posix_acl_xattr.h> 24#include <linux/posix_acl_xattr.h>
26#include <linux/xattr.h> 25#include <linux/xattr.h>
27#include <linux/jhash.h> 26#include <linux/jhash.h>
28#include <linux/ima.h> 27#include <linux/ima.h>
28#include <linux/slab.h>
29#include <asm/uaccess.h> 29#include <asm/uaccess.h>
30#include <linux/exportfs.h>
31#include <linux/writeback.h>
30 32
31#ifdef CONFIG_NFSD_V3 33#ifdef CONFIG_NFSD_V3
32#include "xdr3.h" 34#include "xdr3.h"
@@ -271,6 +273,32 @@ out:
271 return err; 273 return err;
272} 274}
273 275
276/*
277 * Commit metadata changes to stable storage.
278 */
279static int
280commit_metadata(struct svc_fh *fhp)
281{
282 struct inode *inode = fhp->fh_dentry->d_inode;
283 const struct export_operations *export_ops = inode->i_sb->s_export_op;
284 int error = 0;
285
286 if (!EX_ISSYNC(fhp->fh_export))
287 return 0;
288
289 if (export_ops->commit_metadata) {
290 error = export_ops->commit_metadata(inode);
291 } else {
292 struct writeback_control wbc = {
293 .sync_mode = WB_SYNC_ALL,
294 .nr_to_write = 0, /* metadata only */
295 };
296
297 error = sync_inode(inode, &wbc);
298 }
299
300 return error;
301}
274 302
275/* 303/*
276 * Set various file attributes. 304 * Set various file attributes.
@@ -361,7 +389,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
361 * If we are changing the size of the file, then 389 * If we are changing the size of the file, then
362 * we need to break all leases. 390 * we need to break all leases.
363 */ 391 */
364 host_err = break_lease(inode, FMODE_WRITE | O_NONBLOCK); 392 host_err = break_lease(inode, O_WRONLY | O_NONBLOCK);
365 if (host_err == -EWOULDBLOCK) 393 if (host_err == -EWOULDBLOCK)
366 host_err = -ETIMEDOUT; 394 host_err = -ETIMEDOUT;
367 if (host_err) /* ENOMEM or EWOULDBLOCK */ 395 if (host_err) /* ENOMEM or EWOULDBLOCK */
@@ -377,7 +405,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
377 put_write_access(inode); 405 put_write_access(inode);
378 goto out_nfserr; 406 goto out_nfserr;
379 } 407 }
380 vfs_dq_init(inode);
381 } 408 }
382 409
383 /* sanitize the mode change */ 410 /* sanitize the mode change */
@@ -734,7 +761,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
734 * Check to see if there are any leases on this file. 761 * Check to see if there are any leases on this file.
735 * This may block while leases are broken. 762 * This may block while leases are broken.
736 */ 763 */
737 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? FMODE_WRITE : 0)); 764 host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
738 if (host_err == -EWOULDBLOCK) 765 if (host_err == -EWOULDBLOCK)
739 host_err = -ETIMEDOUT; 766 host_err = -ETIMEDOUT;
740 if (host_err) /* NOMEM or WOULDBLOCK */ 767 if (host_err) /* NOMEM or WOULDBLOCK */
@@ -745,13 +772,13 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
745 flags = O_RDWR|O_LARGEFILE; 772 flags = O_RDWR|O_LARGEFILE;
746 else 773 else
747 flags = O_WRONLY|O_LARGEFILE; 774 flags = O_WRONLY|O_LARGEFILE;
748
749 vfs_dq_init(inode);
750 } 775 }
751 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt), 776 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt),
752 flags, current_cred()); 777 flags, current_cred());
753 if (IS_ERR(*filp)) 778 if (IS_ERR(*filp))
754 host_err = PTR_ERR(*filp); 779 host_err = PTR_ERR(*filp);
780 else
781 host_err = ima_file_check(*filp, access);
755out_nfserr: 782out_nfserr:
756 err = nfserrno(host_err); 783 err = nfserrno(host_err);
757out: 784out:
@@ -769,46 +796,6 @@ nfsd_close(struct file *filp)
769} 796}
770 797
771/* 798/*
772 * Sync a file
773 * As this calls fsync (not fdatasync) there is no need for a write_inode
774 * after it.
775 */
776static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
777 const struct file_operations *fop)
778{
779 struct inode *inode = dp->d_inode;
780 int (*fsync) (struct file *, struct dentry *, int);
781 int err;
782
783 err = filemap_fdatawrite(inode->i_mapping);
784 if (err == 0 && fop && (fsync = fop->fsync))
785 err = fsync(filp, dp, 0);
786 if (err == 0)
787 err = filemap_fdatawait(inode->i_mapping);
788
789 return err;
790}
791
792static int
793nfsd_sync(struct file *filp)
794{
795 int err;
796 struct inode *inode = filp->f_path.dentry->d_inode;
797 dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name);
798 mutex_lock(&inode->i_mutex);
799 err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op);
800 mutex_unlock(&inode->i_mutex);
801
802 return err;
803}
804
805int
806nfsd_sync_dir(struct dentry *dp)
807{
808 return nfsd_dosync(NULL, dp, dp->d_inode->i_fop);
809}
810
811/*
812 * Obtain the readahead parameters for the file 799 * Obtain the readahead parameters for the file
813 * specified by (dev, ino). 800 * specified by (dev, ino).
814 */ 801 */
@@ -1011,7 +998,7 @@ static int wait_for_concurrent_writes(struct file *file)
1011 998
1012 if (inode->i_state & I_DIRTY) { 999 if (inode->i_state & I_DIRTY) {
1013 dprintk("nfsd: write sync %d\n", task_pid_nr(current)); 1000 dprintk("nfsd: write sync %d\n", task_pid_nr(current));
1014 err = nfsd_sync(file); 1001 err = vfs_fsync(file, file->f_path.dentry, 0);
1015 } 1002 }
1016 last_ino = inode->i_ino; 1003 last_ino = inode->i_ino;
1017 last_dev = inode->i_sb->s_dev; 1004 last_dev = inode->i_sb->s_dev;
@@ -1159,8 +1146,9 @@ out:
1159#ifdef CONFIG_NFSD_V3 1146#ifdef CONFIG_NFSD_V3
1160/* 1147/*
1161 * Commit all pending writes to stable storage. 1148 * Commit all pending writes to stable storage.
1162 * Strictly speaking, we could sync just the indicated file region here, 1149 *
1163 * but there's currently no way we can ask the VFS to do so. 1150 * Note: we only guarantee that data that lies within the range specified
1151 * by the 'offset' and 'count' parameters will be synced.
1164 * 1152 *
1165 * Unfortunately we cannot lock the file to make sure we return full WCC 1153 * Unfortunately we cannot lock the file to make sure we return full WCC
1166 * data to the client, as locking happens lower down in the filesystem. 1154 * data to the client, as locking happens lower down in the filesystem.
@@ -1170,23 +1158,32 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
1170 loff_t offset, unsigned long count) 1158 loff_t offset, unsigned long count)
1171{ 1159{
1172 struct file *file; 1160 struct file *file;
1173 __be32 err; 1161 loff_t end = LLONG_MAX;
1162 __be32 err = nfserr_inval;
1174 1163
1175 if ((u64)count > ~(u64)offset) 1164 if (offset < 0)
1176 return nfserr_inval; 1165 goto out;
1166 if (count != 0) {
1167 end = offset + (loff_t)count - 1;
1168 if (end < offset)
1169 goto out;
1170 }
1177 1171
1178 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); 1172 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
1179 if (err) 1173 if (err)
1180 return err; 1174 goto out;
1181 if (EX_ISSYNC(fhp->fh_export)) { 1175 if (EX_ISSYNC(fhp->fh_export)) {
1182 if (file->f_op && file->f_op->fsync) { 1176 int err2 = vfs_fsync_range(file, file->f_path.dentry,
1183 err = nfserrno(nfsd_sync(file)); 1177 offset, end, 0);
1184 } else { 1178
1179 if (err2 != -EINVAL)
1180 err = nfserrno(err2);
1181 else
1185 err = nfserr_notsupp; 1182 err = nfserr_notsupp;
1186 }
1187 } 1183 }
1188 1184
1189 nfsd_close(file); 1185 nfsd_close(file);
1186out:
1190 return err; 1187 return err;
1191} 1188}
1192#endif /* CONFIG_NFSD_V3 */ 1189#endif /* CONFIG_NFSD_V3 */
@@ -1339,12 +1336,14 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1339 goto out_nfserr; 1336 goto out_nfserr;
1340 } 1337 }
1341 1338
1342 if (EX_ISSYNC(fhp->fh_export)) { 1339 err = nfsd_create_setattr(rqstp, resfhp, iap);
1343 err = nfserrno(nfsd_sync_dir(dentry));
1344 write_inode_now(dchild->d_inode, 1);
1345 }
1346 1340
1347 err2 = nfsd_create_setattr(rqstp, resfhp, iap); 1341 /*
1342 * nfsd_setattr already committed the child. Transactional filesystems
1343 * had a chance to commit changes for both parent and child
1344 * simultaneously making the following commit_metadata a noop.
1345 */
1346 err2 = nfserrno(commit_metadata(fhp));
1348 if (err2) 1347 if (err2)
1349 err = err2; 1348 err = err2;
1350 mnt_drop_write(fhp->fh_export->ex_path.mnt); 1349 mnt_drop_write(fhp->fh_export->ex_path.mnt);
@@ -1376,7 +1375,6 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1376 struct dentry *dentry, *dchild = NULL; 1375 struct dentry *dentry, *dchild = NULL;
1377 struct inode *dirp; 1376 struct inode *dirp;
1378 __be32 err; 1377 __be32 err;
1379 __be32 err2;
1380 int host_err; 1378 int host_err;
1381 __u32 v_mtime=0, v_atime=0; 1379 __u32 v_mtime=0, v_atime=0;
1382 1380
@@ -1471,11 +1469,6 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1471 if (created) 1469 if (created)
1472 *created = 1; 1470 *created = 1;
1473 1471
1474 if (EX_ISSYNC(fhp->fh_export)) {
1475 err = nfserrno(nfsd_sync_dir(dentry));
1476 /* setattr will sync the child (or not) */
1477 }
1478
1479 nfsd_check_ignore_resizing(iap); 1472 nfsd_check_ignore_resizing(iap);
1480 1473
1481 if (createmode == NFS3_CREATE_EXCLUSIVE) { 1474 if (createmode == NFS3_CREATE_EXCLUSIVE) {
@@ -1490,9 +1483,13 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1490 } 1483 }
1491 1484
1492 set_attr: 1485 set_attr:
1493 err2 = nfsd_create_setattr(rqstp, resfhp, iap); 1486 err = nfsd_create_setattr(rqstp, resfhp, iap);
1494 if (err2) 1487
1495 err = err2; 1488 /*
1489 * nfsd_setattr already committed the child (and possibly also the parent).
1490 */
1491 if (!err)
1492 err = nfserrno(commit_metadata(fhp));
1496 1493
1497 mnt_drop_write(fhp->fh_export->ex_path.mnt); 1494 mnt_drop_write(fhp->fh_export->ex_path.mnt);
1498 /* 1495 /*
@@ -1607,12 +1604,9 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
1607 } 1604 }
1608 } else 1605 } else
1609 host_err = vfs_symlink(dentry->d_inode, dnew, path); 1606 host_err = vfs_symlink(dentry->d_inode, dnew, path);
1610
1611 if (!host_err) {
1612 if (EX_ISSYNC(fhp->fh_export))
1613 host_err = nfsd_sync_dir(dentry);
1614 }
1615 err = nfserrno(host_err); 1607 err = nfserrno(host_err);
1608 if (!err)
1609 err = nfserrno(commit_metadata(fhp));
1616 fh_unlock(fhp); 1610 fh_unlock(fhp);
1617 1611
1618 mnt_drop_write(fhp->fh_export->ex_path.mnt); 1612 mnt_drop_write(fhp->fh_export->ex_path.mnt);
@@ -1674,11 +1668,9 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1674 } 1668 }
1675 host_err = vfs_link(dold, dirp, dnew); 1669 host_err = vfs_link(dold, dirp, dnew);
1676 if (!host_err) { 1670 if (!host_err) {
1677 if (EX_ISSYNC(ffhp->fh_export)) { 1671 err = nfserrno(commit_metadata(ffhp));
1678 err = nfserrno(nfsd_sync_dir(ddir)); 1672 if (!err)
1679 write_inode_now(dest, 1); 1673 err = nfserrno(commit_metadata(tfhp));
1680 }
1681 err = 0;
1682 } else { 1674 } else {
1683 if (host_err == -EXDEV && rqstp->rq_vers == 2) 1675 if (host_err == -EXDEV && rqstp->rq_vers == 2)
1684 err = nfserr_acces; 1676 err = nfserr_acces;
@@ -1774,10 +1766,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1774 goto out_dput_new; 1766 goto out_dput_new;
1775 1767
1776 host_err = vfs_rename(fdir, odentry, tdir, ndentry); 1768 host_err = vfs_rename(fdir, odentry, tdir, ndentry);
1777 if (!host_err && EX_ISSYNC(tfhp->fh_export)) { 1769 if (!host_err) {
1778 host_err = nfsd_sync_dir(tdentry); 1770 host_err = commit_metadata(tfhp);
1779 if (!host_err) 1771 if (!host_err)
1780 host_err = nfsd_sync_dir(fdentry); 1772 host_err = commit_metadata(ffhp);
1781 } 1773 }
1782 1774
1783 mnt_drop_write(ffhp->fh_export->ex_path.mnt); 1775 mnt_drop_write(ffhp->fh_export->ex_path.mnt);
@@ -1858,12 +1850,9 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1858 1850
1859 dput(rdentry); 1851 dput(rdentry);
1860 1852
1861 if (host_err) 1853 if (!host_err)
1862 goto out_drop; 1854 host_err = commit_metadata(fhp);
1863 if (EX_ISSYNC(fhp->fh_export))
1864 host_err = nfsd_sync_dir(dentry);
1865 1855
1866out_drop:
1867 mnt_drop_write(fhp->fh_export->ex_path.mnt); 1856 mnt_drop_write(fhp->fh_export->ex_path.mnt);
1868out_nfserr: 1857out_nfserr:
1869 err = nfserrno(host_err); 1858 err = nfserrno(host_err);
@@ -2130,7 +2119,6 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
2130 */ 2119 */
2131 path.mnt = exp->ex_path.mnt; 2120 path.mnt = exp->ex_path.mnt;
2132 path.dentry = dentry; 2121 path.dentry = dentry;
2133 err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC));
2134nfsd_out: 2122nfsd_out:
2135 return err? nfserrno(err) : 0; 2123 return err? nfserrno(err) : 0;
2136} 2124}
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 3f959f1879d8..7cfb87e692da 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -26,6 +26,7 @@
26#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/bitops.h> 28#include <linux/bitops.h>
29#include <linux/slab.h>
29#include "mdt.h" 30#include "mdt.h"
30#include "alloc.h" 31#include "alloc.h"
31 32
@@ -425,7 +426,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
425 bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); 426 bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
426 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), 427 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
427 group_offset, bitmap)) 428 group_offset, bitmap))
428 printk(KERN_WARNING "%s: entry numer %llu already freed\n", 429 printk(KERN_WARNING "%s: entry number %llu already freed\n",
429 __func__, (unsigned long long)req->pr_entry_nr); 430 __func__, (unsigned long long)req->pr_entry_nr);
430 431
431 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); 432 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index f4543ac4f560..5cccf874d692 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -42,7 +42,7 @@ void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
42 const struct buffer_head *, void *); 42 const struct buffer_head *, void *);
43 43
44/** 44/**
45 * nilfs_palloc_req - persistent alloctor request and reply 45 * nilfs_palloc_req - persistent allocator request and reply
46 * @pr_entry_nr: entry number (vblocknr or inode number) 46 * @pr_entry_nr: entry number (vblocknr or inode number)
47 * @pr_desc_bh: buffer head of the buffer containing block group descriptors 47 * @pr_desc_bh: buffer head of the buffer containing block group descriptors
48 * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap 48 * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index f4a14ea2ed9c..effdbdbe6c11 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -417,8 +417,8 @@ __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
417 417
418 key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT - 418 key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT -
419 bmap->b_inode->i_blkbits); 419 bmap->b_inode->i_blkbits);
420 for (pbh = page_buffers(bh->b_page); pbh != bh; 420 for (pbh = page_buffers(bh->b_page); pbh != bh; pbh = pbh->b_this_page)
421 pbh = pbh->b_this_page, key++); 421 key++;
422 422
423 return key; 423 return key;
424} 424}
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 471e269536ae..447ce47a3306 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -27,6 +27,7 @@
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
30#include <linux/gfp.h>
30#include "nilfs.h" 31#include "nilfs.h"
31#include "mdt.h" 32#include "mdt.h"
32#include "dat.h" 33#include "dat.h"
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 7cdd98b8d514..76c38e3e19d2 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -1879,7 +1879,7 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1879 struct nilfs_btree_path *path, 1879 struct nilfs_btree_path *path,
1880 int level, struct buffer_head *bh) 1880 int level, struct buffer_head *bh)
1881{ 1881{
1882 int maxlevel, ret; 1882 int maxlevel = 0, ret;
1883 struct nilfs_btree_node *parent; 1883 struct nilfs_btree_node *parent;
1884 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); 1884 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
1885 __u64 ptr; 1885 __u64 ptr;
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index d5ad54e204a5..18737818db63 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -328,19 +328,24 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
328 tnicps += nicps; 328 tnicps += nicps;
329 nilfs_mdt_mark_buffer_dirty(cp_bh); 329 nilfs_mdt_mark_buffer_dirty(cp_bh);
330 nilfs_mdt_mark_dirty(cpfile); 330 nilfs_mdt_mark_dirty(cpfile);
331 if (!nilfs_cpfile_is_in_first(cpfile, cno) && 331 if (!nilfs_cpfile_is_in_first(cpfile, cno)) {
332 (count = nilfs_cpfile_block_sub_valid_checkpoints( 332 count =
333 cpfile, cp_bh, kaddr, nicps)) == 0) { 333 nilfs_cpfile_block_sub_valid_checkpoints(
334 /* make hole */ 334 cpfile, cp_bh, kaddr, nicps);
335 kunmap_atomic(kaddr, KM_USER0); 335 if (count == 0) {
336 brelse(cp_bh); 336 /* make hole */
337 ret = nilfs_cpfile_delete_checkpoint_block( 337 kunmap_atomic(kaddr, KM_USER0);
338 cpfile, cno); 338 brelse(cp_bh);
339 if (ret == 0) 339 ret =
340 continue; 340 nilfs_cpfile_delete_checkpoint_block(
341 printk(KERN_ERR "%s: cannot delete block\n", 341 cpfile, cno);
342 __func__); 342 if (ret == 0)
343 break; 343 continue;
344 printk(KERN_ERR
345 "%s: cannot delete block\n",
346 __func__);
347 break;
348 }
344 } 349 }
345 } 350 }
346 351
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 187dd07ba86c..013146755683 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -288,7 +288,7 @@ int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr)
288 * @vblocknrs and @nitems. 288 * @vblocknrs and @nitems.
289 * 289 *
290 * Return Value: On success, 0 is returned. On error, one of the following 290 * Return Value: On success, 0 is returned. On error, one of the following
291 * nagative error codes is returned. 291 * negative error codes is returned.
292 * 292 *
293 * %-EIO - I/O error. 293 * %-EIO - I/O error.
294 * 294 *
@@ -388,8 +388,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
388 ret = -ENOENT; 388 ret = -ENOENT;
389 goto out; 389 goto out;
390 } 390 }
391 if (blocknrp != NULL) 391 *blocknrp = blocknr;
392 *blocknrp = blocknr;
393 392
394 out: 393 out:
395 kunmap_atomic(kaddr, KM_USER0); 394 kunmap_atomic(kaddr, KM_USER0);
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 76d803e060a9..85c89dfc71f0 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -224,7 +224,7 @@ fail:
224 * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller. 224 * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller.
225 */ 225 */
226static int 226static int
227nilfs_match(int len, const char * const name, struct nilfs_dir_entry *de) 227nilfs_match(int len, const unsigned char *name, struct nilfs_dir_entry *de)
228{ 228{
229 if (len != de->name_len) 229 if (len != de->name_len)
230 return 0; 230 return 0;
@@ -349,11 +349,11 @@ done:
349 * Entry is guaranteed to be valid. 349 * Entry is guaranteed to be valid.
350 */ 350 */
351struct nilfs_dir_entry * 351struct nilfs_dir_entry *
352nilfs_find_entry(struct inode *dir, struct dentry *dentry, 352nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
353 struct page **res_page) 353 struct page **res_page)
354{ 354{
355 const char *name = dentry->d_name.name; 355 const unsigned char *name = qstr->name;
356 int namelen = dentry->d_name.len; 356 int namelen = qstr->len;
357 unsigned reclen = NILFS_DIR_REC_LEN(namelen); 357 unsigned reclen = NILFS_DIR_REC_LEN(namelen);
358 unsigned long start, n; 358 unsigned long start, n;
359 unsigned long npages = dir_pages(dir); 359 unsigned long npages = dir_pages(dir);
@@ -396,7 +396,7 @@ nilfs_find_entry(struct inode *dir, struct dentry *dentry,
396 /* next page is past the blocks we've got */ 396 /* next page is past the blocks we've got */
397 if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) { 397 if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
398 nilfs_error(dir->i_sb, __func__, 398 nilfs_error(dir->i_sb, __func__,
399 "dir %lu size %lld exceeds block cout %llu", 399 "dir %lu size %lld exceeds block count %llu",
400 dir->i_ino, dir->i_size, 400 dir->i_ino, dir->i_size,
401 (unsigned long long)dir->i_blocks); 401 (unsigned long long)dir->i_blocks);
402 goto out; 402 goto out;
@@ -424,13 +424,13 @@ struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p)
424 return de; 424 return de;
425} 425}
426 426
427ino_t nilfs_inode_by_name(struct inode *dir, struct dentry *dentry) 427ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr)
428{ 428{
429 ino_t res = 0; 429 ino_t res = 0;
430 struct nilfs_dir_entry *de; 430 struct nilfs_dir_entry *de;
431 struct page *page; 431 struct page *page;
432 432
433 de = nilfs_find_entry(dir, dentry, &page); 433 de = nilfs_find_entry(dir, qstr, &page);
434 if (de) { 434 if (de) {
435 res = le64_to_cpu(de->inode); 435 res = le64_to_cpu(de->inode);
436 kunmap(page); 436 kunmap(page);
@@ -465,7 +465,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
465int nilfs_add_link(struct dentry *dentry, struct inode *inode) 465int nilfs_add_link(struct dentry *dentry, struct inode *inode)
466{ 466{
467 struct inode *dir = dentry->d_parent->d_inode; 467 struct inode *dir = dentry->d_parent->d_inode;
468 const char *name = dentry->d_name.name; 468 const unsigned char *name = dentry->d_name.name;
469 int namelen = dentry->d_name.len; 469 int namelen = dentry->d_name.len;
470 unsigned chunk_size = nilfs_chunk_size(dir); 470 unsigned chunk_size = nilfs_chunk_size(dir);
471 unsigned reclen = NILFS_DIR_REC_LEN(namelen); 471 unsigned reclen = NILFS_DIR_REC_LEN(namelen);
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index d369ac718277..236753df5cdf 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -51,11 +51,11 @@ static int nilfs_direct_lookup(const struct nilfs_bmap *bmap,
51 struct nilfs_direct *direct; 51 struct nilfs_direct *direct;
52 __u64 ptr; 52 __u64 ptr;
53 53
54 direct = (struct nilfs_direct *)bmap; 54 direct = (struct nilfs_direct *)bmap; /* XXX: use macro for level 1 */
55 if ((key > NILFS_DIRECT_KEY_MAX) || 55 if (key > NILFS_DIRECT_KEY_MAX || level != 1)
56 (level != 1) || /* XXX: use macro for level 1 */ 56 return -ENOENT;
57 ((ptr = nilfs_direct_get_ptr(direct, key)) == 57 ptr = nilfs_direct_get_ptr(direct, key);
58 NILFS_BMAP_INVALID_PTR)) 58 if (ptr == NILFS_BMAP_INVALID_PTR)
59 return -ENOENT; 59 return -ENOENT;
60 60
61 if (ptrp != NULL) 61 if (ptrp != NULL)
@@ -73,9 +73,10 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
73 sector_t blocknr; 73 sector_t blocknr;
74 int ret, cnt; 74 int ret, cnt;
75 75
76 if (key > NILFS_DIRECT_KEY_MAX || 76 if (key > NILFS_DIRECT_KEY_MAX)
77 (ptr = nilfs_direct_get_ptr(direct, key)) == 77 return -ENOENT;
78 NILFS_BMAP_INVALID_PTR) 78 ptr = nilfs_direct_get_ptr(direct, key);
79 if (ptr == NILFS_BMAP_INVALID_PTR)
79 return -ENOENT; 80 return -ENOENT;
80 81
81 if (NILFS_BMAP_USE_VBN(bmap)) { 82 if (NILFS_BMAP_USE_VBN(bmap)) {
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index e16a6664dfa2..145f03cd7d3e 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -28,10 +28,10 @@
28 * gcinodes), and this file provides lookup function of the dummy 28 * gcinodes), and this file provides lookup function of the dummy
29 * inodes and their buffer read function. 29 * inodes and their buffer read function.
30 * 30 *
31 * Since NILFS2 keeps up multiple checkpoints/snapshots accross GC, it 31 * Since NILFS2 keeps up multiple checkpoints/snapshots across GC, it
32 * has to treat blocks that belong to a same file but have different 32 * has to treat blocks that belong to a same file but have different
33 * checkpoint numbers. To avoid interference among generations, dummy 33 * checkpoint numbers. To avoid interference among generations, dummy
34 * inodes are managed separatly from actual inodes, and their lookup 34 * inodes are managed separately from actual inodes, and their lookup
35 * function (nilfs_gc_iget) is designed to be specified with a 35 * function (nilfs_gc_iget) is designed to be specified with a
36 * checkpoint number argument as well as an inode number. 36 * checkpoint number argument as well as an inode number.
37 * 37 *
@@ -45,6 +45,7 @@
45#include <linux/buffer_head.h> 45#include <linux/buffer_head.h>
46#include <linux/mpage.h> 46#include <linux/mpage.h>
47#include <linux/hash.h> 47#include <linux/hash.h>
48#include <linux/slab.h>
48#include <linux/swap.h> 49#include <linux/swap.h>
49#include "nilfs.h" 50#include "nilfs.h"
50#include "page.h" 51#include "page.h"
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 7868cc122ac7..0957b58f909d 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include <linux/buffer_head.h> 24#include <linux/buffer_head.h>
25#include <linux/gfp.h>
25#include <linux/mpage.h> 26#include <linux/mpage.h>
26#include <linux/writeback.h> 27#include <linux/writeback.h>
27#include <linux/uio.h> 28#include <linux/uio.h>
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index f6af76042d80..f90a33d9a5b0 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -23,9 +23,11 @@
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/wait.h> 24#include <linux/wait.h>
25#include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */ 25#include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */
26#include <linux/slab.h>
26#include <linux/capability.h> /* capable() */ 27#include <linux/capability.h> /* capable() */
27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */ 28#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
28#include <linux/vmalloc.h> 29#include <linux/vmalloc.h>
30#include <linux/mount.h> /* mnt_want_write(), mnt_drop_write() */
29#include <linux/nilfs2_fs.h> 31#include <linux/nilfs2_fs.h>
30#include "nilfs.h" 32#include "nilfs.h"
31#include "segment.h" 33#include "segment.h"
@@ -107,20 +109,28 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
107 109
108 if (!capable(CAP_SYS_ADMIN)) 110 if (!capable(CAP_SYS_ADMIN))
109 return -EPERM; 111 return -EPERM;
112
113 ret = mnt_want_write(filp->f_path.mnt);
114 if (ret)
115 return ret;
116
117 ret = -EFAULT;
110 if (copy_from_user(&cpmode, argp, sizeof(cpmode))) 118 if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
111 return -EFAULT; 119 goto out;
112 120
113 mutex_lock(&nilfs->ns_mount_mutex); 121 mutex_lock(&nilfs->ns_mount_mutex);
122
114 nilfs_transaction_begin(inode->i_sb, &ti, 0); 123 nilfs_transaction_begin(inode->i_sb, &ti, 0);
115 ret = nilfs_cpfile_change_cpmode( 124 ret = nilfs_cpfile_change_cpmode(
116 cpfile, cpmode.cm_cno, cpmode.cm_mode); 125 cpfile, cpmode.cm_cno, cpmode.cm_mode);
117 if (unlikely(ret < 0)) { 126 if (unlikely(ret < 0))
118 nilfs_transaction_abort(inode->i_sb); 127 nilfs_transaction_abort(inode->i_sb);
119 mutex_unlock(&nilfs->ns_mount_mutex); 128 else
120 return ret; 129 nilfs_transaction_commit(inode->i_sb); /* never fails */
121 } 130
122 nilfs_transaction_commit(inode->i_sb); /* never fails */
123 mutex_unlock(&nilfs->ns_mount_mutex); 131 mutex_unlock(&nilfs->ns_mount_mutex);
132out:
133 mnt_drop_write(filp->f_path.mnt);
124 return ret; 134 return ret;
125} 135}
126 136
@@ -135,16 +145,23 @@ nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
135 145
136 if (!capable(CAP_SYS_ADMIN)) 146 if (!capable(CAP_SYS_ADMIN))
137 return -EPERM; 147 return -EPERM;
148
149 ret = mnt_want_write(filp->f_path.mnt);
150 if (ret)
151 return ret;
152
153 ret = -EFAULT;
138 if (copy_from_user(&cno, argp, sizeof(cno))) 154 if (copy_from_user(&cno, argp, sizeof(cno)))
139 return -EFAULT; 155 goto out;
140 156
141 nilfs_transaction_begin(inode->i_sb, &ti, 0); 157 nilfs_transaction_begin(inode->i_sb, &ti, 0);
142 ret = nilfs_cpfile_delete_checkpoint(cpfile, cno); 158 ret = nilfs_cpfile_delete_checkpoint(cpfile, cno);
143 if (unlikely(ret < 0)) { 159 if (unlikely(ret < 0))
144 nilfs_transaction_abort(inode->i_sb); 160 nilfs_transaction_abort(inode->i_sb);
145 return ret; 161 else
146 } 162 nilfs_transaction_commit(inode->i_sb); /* never fails */
147 nilfs_transaction_commit(inode->i_sb); /* never fails */ 163out:
164 mnt_drop_write(filp->f_path.mnt);
148 return ret; 165 return ret;
149} 166}
150 167
@@ -480,7 +497,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
480 unsigned int cmd, void __user *argp) 497 unsigned int cmd, void __user *argp)
481{ 498{
482 struct nilfs_argv argv[5]; 499 struct nilfs_argv argv[5];
483 const static size_t argsz[5] = { 500 static const size_t argsz[5] = {
484 sizeof(struct nilfs_vdesc), 501 sizeof(struct nilfs_vdesc),
485 sizeof(struct nilfs_period), 502 sizeof(struct nilfs_period),
486 sizeof(__u64), 503 sizeof(__u64),
@@ -496,12 +513,19 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
496 if (!capable(CAP_SYS_ADMIN)) 513 if (!capable(CAP_SYS_ADMIN))
497 return -EPERM; 514 return -EPERM;
498 515
516 ret = mnt_want_write(filp->f_path.mnt);
517 if (ret)
518 return ret;
519
520 ret = -EFAULT;
499 if (copy_from_user(argv, argp, sizeof(argv))) 521 if (copy_from_user(argv, argp, sizeof(argv)))
500 return -EFAULT; 522 goto out;
501 523
524 ret = -EINVAL;
502 nsegs = argv[4].v_nmembs; 525 nsegs = argv[4].v_nmembs;
503 if (argv[4].v_size != argsz[4]) 526 if (argv[4].v_size != argsz[4])
504 return -EINVAL; 527 goto out;
528
505 /* 529 /*
506 * argv[4] points to segment numbers this ioctl cleans. We 530 * argv[4] points to segment numbers this ioctl cleans. We
507 * use kmalloc() for its buffer because memory used for the 531 * use kmalloc() for its buffer because memory used for the
@@ -509,9 +533,10 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
509 */ 533 */
510 kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base, 534 kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base,
511 nsegs * sizeof(__u64)); 535 nsegs * sizeof(__u64));
512 if (IS_ERR(kbufs[4])) 536 if (IS_ERR(kbufs[4])) {
513 return PTR_ERR(kbufs[4]); 537 ret = PTR_ERR(kbufs[4]);
514 538 goto out;
539 }
515 nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 540 nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
516 541
517 for (n = 0; n < 4; n++) { 542 for (n = 0; n < 4; n++) {
@@ -563,10 +588,12 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
563 nilfs_remove_all_gcinode(nilfs); 588 nilfs_remove_all_gcinode(nilfs);
564 clear_nilfs_gc_running(nilfs); 589 clear_nilfs_gc_running(nilfs);
565 590
566 out_free: 591out_free:
567 while (--n >= 0) 592 while (--n >= 0)
568 vfree(kbufs[n]); 593 vfree(kbufs[n]);
569 kfree(kbufs[4]); 594 kfree(kbufs[4]);
595out:
596 mnt_drop_write(filp->f_path.mnt);
570 return ret; 597 return ret;
571} 598}
572 599
@@ -575,13 +602,17 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
575{ 602{
576 __u64 cno; 603 __u64 cno;
577 int ret; 604 int ret;
605 struct the_nilfs *nilfs;
578 606
579 ret = nilfs_construct_segment(inode->i_sb); 607 ret = nilfs_construct_segment(inode->i_sb);
580 if (ret < 0) 608 if (ret < 0)
581 return ret; 609 return ret;
582 610
583 if (argp != NULL) { 611 if (argp != NULL) {
584 cno = NILFS_SB(inode->i_sb)->s_nilfs->ns_cno - 1; 612 nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
613 down_read(&nilfs->ns_segctor_sem);
614 cno = nilfs->ns_cno - 1;
615 up_read(&nilfs->ns_segctor_sem);
585 if (copy_to_user(argp, &cno, sizeof(cno))) 616 if (copy_to_user(argp, &cno, sizeof(cno)))
586 return -EFAULT; 617 return -EFAULT;
587 } 618 }
@@ -618,7 +649,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
618long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 649long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
619{ 650{
620 struct inode *inode = filp->f_dentry->d_inode; 651 struct inode *inode = filp->f_dentry->d_inode;
621 void __user *argp = (void * __user *)arg; 652 void __user *argp = (void __user *)arg;
622 653
623 switch (cmd) { 654 switch (cmd) {
624 case NILFS_IOCTL_CHANGE_CPMODE: 655 case NILFS_IOCTL_CHANGE_CPMODE:
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 06713ffcc7f2..024be8c35bb6 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -26,6 +26,7 @@
26#include <linux/writeback.h> 26#include <linux/writeback.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/swap.h> 28#include <linux/swap.h>
29#include <linux/slab.h>
29#include "nilfs.h" 30#include "nilfs.h"
30#include "segment.h" 31#include "segment.h"
31#include "page.h" 32#include "page.h"
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 07ba838ef089..ad6ed2cf19b4 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -67,7 +67,7 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
67 if (dentry->d_name.len > NILFS_NAME_LEN) 67 if (dentry->d_name.len > NILFS_NAME_LEN)
68 return ERR_PTR(-ENAMETOOLONG); 68 return ERR_PTR(-ENAMETOOLONG);
69 69
70 ino = nilfs_inode_by_name(dir, dentry); 70 ino = nilfs_inode_by_name(dir, &dentry->d_name);
71 inode = NULL; 71 inode = NULL;
72 if (ino) { 72 if (ino) {
73 inode = nilfs_iget(dir->i_sb, ino); 73 inode = nilfs_iget(dir->i_sb, ino);
@@ -81,10 +81,7 @@ struct dentry *nilfs_get_parent(struct dentry *child)
81{ 81{
82 unsigned long ino; 82 unsigned long ino;
83 struct inode *inode; 83 struct inode *inode;
84 struct dentry dotdot; 84 struct qstr dotdot = {.name = "..", .len = 2};
85
86 dotdot.d_name.name = "..";
87 dotdot.d_name.len = 2;
88 85
89 ino = nilfs_inode_by_name(child->d_inode, &dotdot); 86 ino = nilfs_inode_by_name(child->d_inode, &dotdot);
90 if (!ino) 87 if (!ino)
@@ -296,7 +293,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
296 int err; 293 int err;
297 294
298 err = -ENOENT; 295 err = -ENOENT;
299 de = nilfs_find_entry(dir, dentry, &page); 296 de = nilfs_find_entry(dir, &dentry->d_name, &page);
300 if (!de) 297 if (!de)
301 goto out; 298 goto out;
302 299
@@ -389,7 +386,7 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
389 return err; 386 return err;
390 387
391 err = -ENOENT; 388 err = -ENOENT;
392 old_de = nilfs_find_entry(old_dir, old_dentry, &old_page); 389 old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_page);
393 if (!old_de) 390 if (!old_de)
394 goto out; 391 goto out;
395 392
@@ -409,7 +406,7 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
409 goto out_dir; 406 goto out_dir;
410 407
411 err = -ENOENT; 408 err = -ENOENT;
412 new_de = nilfs_find_entry(new_dir, new_dentry, &new_page); 409 new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page);
413 if (!new_de) 410 if (!new_de)
414 goto out_dir; 411 goto out_dir;
415 inc_nlink(old_inode); 412 inc_nlink(old_inode);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 4da6f67e9a91..8723e5bfd071 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -217,10 +217,10 @@ static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
217 217
218/* dir.c */ 218/* dir.c */
219extern int nilfs_add_link(struct dentry *, struct inode *); 219extern int nilfs_add_link(struct dentry *, struct inode *);
220extern ino_t nilfs_inode_by_name(struct inode *, struct dentry *); 220extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *);
221extern int nilfs_make_empty(struct inode *, struct inode *); 221extern int nilfs_make_empty(struct inode *, struct inode *);
222extern struct nilfs_dir_entry * 222extern struct nilfs_dir_entry *
223nilfs_find_entry(struct inode *, struct dentry *, struct page **); 223nilfs_find_entry(struct inode *, const struct qstr *, struct page **);
224extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *); 224extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *);
225extern int nilfs_empty_dir(struct inode *); 225extern int nilfs_empty_dir(struct inode *);
226extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **); 226extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index a2692bbc7b50..8de3e1e48130 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -29,6 +29,7 @@
29#include <linux/list.h> 29#include <linux/list.h>
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31#include <linux/pagevec.h> 31#include <linux/pagevec.h>
32#include <linux/gfp.h>
32#include "nilfs.h" 33#include "nilfs.h"
33#include "page.h" 34#include "page.h"
34#include "mdt.h" 35#include "mdt.h"
@@ -292,7 +293,7 @@ void nilfs_free_private_page(struct page *page)
292 * @src: source page 293 * @src: source page
293 * @copy_dirty: flag whether to copy dirty states on the page's buffer heads. 294 * @copy_dirty: flag whether to copy dirty states on the page's buffer heads.
294 * 295 *
295 * This fuction is for both data pages and btnode pages. The dirty flag 296 * This function is for both data pages and btnode pages. The dirty flag
296 * should be treated by caller. The page must not be under i/o. 297 * should be treated by caller. The page must not be under i/o.
297 * Both src and dst page must be locked 298 * Both src and dst page must be locked
298 */ 299 */
@@ -388,7 +389,7 @@ repeat:
388} 389}
389 390
390/** 391/**
391 * nilfs_copy_back_pages -- copy back pages to orignal cache from shadow cache 392 * nilfs_copy_back_pages -- copy back pages to original cache from shadow cache
392 * @dmap: destination page cache 393 * @dmap: destination page cache
393 * @smap: source page cache 394 * @smap: source page cache
394 * 395 *
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index c9c96c7825dc..ba43146f3c30 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -23,6 +23,7 @@
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/blkdev.h> 24#include <linux/blkdev.h>
25#include <linux/swap.h> 25#include <linux/swap.h>
26#include <linux/slab.h>
26#include <linux/crc32.h> 27#include <linux/crc32.h>
27#include "nilfs.h" 28#include "nilfs.h"
28#include "segment.h" 29#include "segment.h"
@@ -39,7 +40,6 @@ enum {
39 NILFS_SEG_FAIL_IO, 40 NILFS_SEG_FAIL_IO,
40 NILFS_SEG_FAIL_MAGIC, 41 NILFS_SEG_FAIL_MAGIC,
41 NILFS_SEG_FAIL_SEQ, 42 NILFS_SEG_FAIL_SEQ,
42 NILFS_SEG_FAIL_CHECKSUM_SEGSUM,
43 NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT, 43 NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT,
44 NILFS_SEG_FAIL_CHECKSUM_FULL, 44 NILFS_SEG_FAIL_CHECKSUM_FULL,
45 NILFS_SEG_FAIL_CONSISTENCY, 45 NILFS_SEG_FAIL_CONSISTENCY,
@@ -71,10 +71,6 @@ static int nilfs_warn_segment_error(int err)
71 printk(KERN_WARNING 71 printk(KERN_WARNING
72 "NILFS warning: Sequence number mismatch\n"); 72 "NILFS warning: Sequence number mismatch\n");
73 break; 73 break;
74 case NILFS_SEG_FAIL_CHECKSUM_SEGSUM:
75 printk(KERN_WARNING
76 "NILFS warning: Checksum error in segment summary\n");
77 break;
78 case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT: 74 case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT:
79 printk(KERN_WARNING 75 printk(KERN_WARNING
80 "NILFS warning: Checksum error in super root\n"); 76 "NILFS warning: Checksum error in super root\n");
@@ -206,19 +202,15 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
206 * @pseg_start: start disk block number of partial segment 202 * @pseg_start: start disk block number of partial segment
207 * @seg_seq: sequence number requested 203 * @seg_seq: sequence number requested
208 * @ssi: pointer to nilfs_segsum_info struct to store information 204 * @ssi: pointer to nilfs_segsum_info struct to store information
209 * @full_check: full check flag
210 * (0: only checks segment summary CRC, 1: data CRC)
211 */ 205 */
212static int 206static int
213load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start, 207load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
214 u64 seg_seq, struct nilfs_segsum_info *ssi, 208 u64 seg_seq, struct nilfs_segsum_info *ssi)
215 int full_check)
216{ 209{
217 struct buffer_head *bh_sum; 210 struct buffer_head *bh_sum;
218 struct nilfs_segment_summary *sum; 211 struct nilfs_segment_summary *sum;
219 unsigned long offset, nblock; 212 unsigned long nblock;
220 u64 check_bytes; 213 u32 crc;
221 u32 crc, crc_sum;
222 int ret = NILFS_SEG_FAIL_IO; 214 int ret = NILFS_SEG_FAIL_IO;
223 215
224 bh_sum = sb_bread(sbi->s_super, pseg_start); 216 bh_sum = sb_bread(sbi->s_super, pseg_start);
@@ -237,34 +229,24 @@ load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
237 ret = NILFS_SEG_FAIL_SEQ; 229 ret = NILFS_SEG_FAIL_SEQ;
238 goto failed; 230 goto failed;
239 } 231 }
240 if (full_check) {
241 offset = sizeof(sum->ss_datasum);
242 check_bytes =
243 ((u64)ssi->nblocks << sbi->s_super->s_blocksize_bits);
244 nblock = ssi->nblocks;
245 crc_sum = le32_to_cpu(sum->ss_datasum);
246 ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
247 } else { /* only checks segment summary */
248 offset = sizeof(sum->ss_datasum) + sizeof(sum->ss_sumsum);
249 check_bytes = ssi->sumbytes;
250 nblock = ssi->nsumblk;
251 crc_sum = le32_to_cpu(sum->ss_sumsum);
252 ret = NILFS_SEG_FAIL_CHECKSUM_SEGSUM;
253 }
254 232
233 nblock = ssi->nblocks;
255 if (unlikely(nblock == 0 || 234 if (unlikely(nblock == 0 ||
256 nblock > sbi->s_nilfs->ns_blocks_per_segment)) { 235 nblock > sbi->s_nilfs->ns_blocks_per_segment)) {
257 /* This limits the number of blocks read in the CRC check */ 236 /* This limits the number of blocks read in the CRC check */
258 ret = NILFS_SEG_FAIL_CONSISTENCY; 237 ret = NILFS_SEG_FAIL_CONSISTENCY;
259 goto failed; 238 goto failed;
260 } 239 }
261 if (calc_crc_cont(sbi, bh_sum, &crc, offset, check_bytes, 240 if (calc_crc_cont(sbi, bh_sum, &crc, sizeof(sum->ss_datasum),
241 ((u64)nblock << sbi->s_super->s_blocksize_bits),
262 pseg_start, nblock)) { 242 pseg_start, nblock)) {
263 ret = NILFS_SEG_FAIL_IO; 243 ret = NILFS_SEG_FAIL_IO;
264 goto failed; 244 goto failed;
265 } 245 }
266 if (crc == crc_sum) 246 if (crc == le32_to_cpu(sum->ss_datasum))
267 ret = 0; 247 ret = 0;
248 else
249 ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
268 failed: 250 failed:
269 brelse(bh_sum); 251 brelse(bh_sum);
270 out: 252 out:
@@ -598,7 +580,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
598 580
599 while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) { 581 while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) {
600 582
601 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1); 583 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
602 if (ret) { 584 if (ret) {
603 if (ret == NILFS_SEG_FAIL_IO) { 585 if (ret == NILFS_SEG_FAIL_IO) {
604 err = -EIO; 586 err = -EIO;
@@ -821,7 +803,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
821 803
822 for (;;) { 804 for (;;) {
823 /* Load segment summary */ 805 /* Load segment summary */
824 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1); 806 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
825 if (ret) { 807 if (ret) {
826 if (ret == NILFS_SEG_FAIL_IO) 808 if (ret == NILFS_SEG_FAIL_IO)
827 goto failed; 809 goto failed;
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 645c78656aa0..17851f77f739 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -25,6 +25,7 @@
25#include <linux/writeback.h> 25#include <linux/writeback.h>
26#include <linux/crc32.h> 26#include <linux/crc32.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/slab.h>
28#include "page.h" 29#include "page.h"
29#include "segbuf.h" 30#include "segbuf.h"
30 31
@@ -32,7 +33,7 @@
32struct nilfs_write_info { 33struct nilfs_write_info {
33 struct the_nilfs *nilfs; 34 struct the_nilfs *nilfs;
34 struct bio *bio; 35 struct bio *bio;
35 int start, end; /* The region to be submitted */ 36 int start, end; /* The region to be submitted */
36 int rest_blocks; 37 int rest_blocks;
37 int max_pages; 38 int max_pages;
38 int nr_vecs; 39 int nr_vecs;
@@ -40,6 +41,11 @@ struct nilfs_write_info {
40}; 41};
41 42
42 43
44static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
45 struct the_nilfs *nilfs);
46static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
47
48
43static struct kmem_cache *nilfs_segbuf_cachep; 49static struct kmem_cache *nilfs_segbuf_cachep;
44 50
45static void nilfs_segbuf_init_once(void *obj) 51static void nilfs_segbuf_init_once(void *obj)
@@ -169,7 +175,7 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
169} 175}
170 176
171/* 177/*
172 * Setup segument summary 178 * Setup segment summary
173 */ 179 */
174void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf) 180void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
175{ 181{
@@ -302,17 +308,30 @@ void nilfs_truncate_logs(struct list_head *logs,
302 } 308 }
303} 309}
304 310
311int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs)
312{
313 struct nilfs_segment_buffer *segbuf;
314 int ret = 0;
315
316 list_for_each_entry(segbuf, logs, sb_list) {
317 ret = nilfs_segbuf_write(segbuf, nilfs);
318 if (ret)
319 break;
320 }
321 return ret;
322}
323
305int nilfs_wait_on_logs(struct list_head *logs) 324int nilfs_wait_on_logs(struct list_head *logs)
306{ 325{
307 struct nilfs_segment_buffer *segbuf; 326 struct nilfs_segment_buffer *segbuf;
308 int err; 327 int err, ret = 0;
309 328
310 list_for_each_entry(segbuf, logs, sb_list) { 329 list_for_each_entry(segbuf, logs, sb_list) {
311 err = nilfs_segbuf_wait(segbuf); 330 err = nilfs_segbuf_wait(segbuf);
312 if (err) 331 if (err && !ret)
313 return err; 332 ret = err;
314 } 333 }
315 return 0; 334 return ret;
316} 335}
317 336
318/* 337/*
@@ -452,8 +471,8 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
452 * 471 *
453 * %-ENOMEM - Insufficient memory available. 472 * %-ENOMEM - Insufficient memory available.
454 */ 473 */
455int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, 474static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
456 struct the_nilfs *nilfs) 475 struct the_nilfs *nilfs)
457{ 476{
458 struct nilfs_write_info wi; 477 struct nilfs_write_info wi;
459 struct buffer_head *bh; 478 struct buffer_head *bh;
@@ -496,7 +515,7 @@ int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
496 * 515 *
497 * %-EIO - I/O error 516 * %-EIO - I/O error
498 */ 517 */
499int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf) 518static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
500{ 519{
501 int err = 0; 520 int err = 0;
502 521
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 6af1630fb401..94dfd3517bc0 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -166,13 +166,10 @@ nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf,
166 segbuf->sb_sum.nfileblk++; 166 segbuf->sb_sum.nfileblk++;
167} 167}
168 168
169int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
170 struct the_nilfs *nilfs);
171int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
172
173void nilfs_clear_logs(struct list_head *logs); 169void nilfs_clear_logs(struct list_head *logs);
174void nilfs_truncate_logs(struct list_head *logs, 170void nilfs_truncate_logs(struct list_head *logs,
175 struct nilfs_segment_buffer *last); 171 struct nilfs_segment_buffer *last);
172int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs);
176int nilfs_wait_on_logs(struct list_head *logs); 173int nilfs_wait_on_logs(struct list_head *logs);
177 174
178static inline void nilfs_destroy_logs(struct list_head *logs) 175static inline void nilfs_destroy_logs(struct list_head *logs)
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 17584c524486..6a7dbd8451db 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -32,6 +32,7 @@
32#include <linux/kthread.h> 32#include <linux/kthread.h>
33#include <linux/crc32.h> 33#include <linux/crc32.h>
34#include <linux/pagevec.h> 34#include <linux/pagevec.h>
35#include <linux/slab.h>
35#include "nilfs.h" 36#include "nilfs.h"
36#include "btnode.h" 37#include "btnode.h"
37#include "page.h" 38#include "page.h"
@@ -141,7 +142,7 @@ int nilfs_init_transaction_cache(void)
141} 142}
142 143
143/** 144/**
144 * nilfs_detroy_transaction_cache - destroy the cache for transaction info 145 * nilfs_destroy_transaction_cache - destroy the cache for transaction info
145 * 146 *
146 * nilfs_destroy_transaction_cache() frees the slab cache for the struct 147 * nilfs_destroy_transaction_cache() frees the slab cache for the struct
147 * nilfs_transaction_info. 148 * nilfs_transaction_info.
@@ -201,7 +202,7 @@ static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
201 * This function allocates a nilfs_transaction_info struct to keep context 202 * This function allocates a nilfs_transaction_info struct to keep context
202 * information on it. It is initialized and hooked onto the current task in 203 * information on it. It is initialized and hooked onto the current task in
203 * the outermost call. If a pre-allocated struct is given to @ti, it is used 204 * the outermost call. If a pre-allocated struct is given to @ti, it is used
204 * instead; othewise a new struct is assigned from a slab. 205 * instead; otherwise a new struct is assigned from a slab.
205 * 206 *
206 * When @vacancy_check flag is set, this function will check the amount of 207 * When @vacancy_check flag is set, this function will check the amount of
207 * free space, and will wait for the GC to reclaim disk space if low capacity. 208 * free space, and will wait for the GC to reclaim disk space if low capacity.
@@ -1510,6 +1511,12 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1510 if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE) 1511 if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
1511 break; 1512 break;
1512 1513
1514 nilfs_clear_logs(&sci->sc_segbufs);
1515
1516 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1517 if (unlikely(err))
1518 return err;
1519
1513 if (sci->sc_stage.flags & NILFS_CF_SUFREED) { 1520 if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
1514 err = nilfs_sufile_cancel_freev(nilfs->ns_sufile, 1521 err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
1515 sci->sc_freesegs, 1522 sci->sc_freesegs,
@@ -1517,12 +1524,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1517 NULL); 1524 NULL);
1518 WARN_ON(err); /* do not happen */ 1525 WARN_ON(err); /* do not happen */
1519 } 1526 }
1520 nilfs_clear_logs(&sci->sc_segbufs);
1521
1522 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1523 if (unlikely(err))
1524 return err;
1525
1526 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); 1527 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
1527 sci->sc_stage = prev_stage; 1528 sci->sc_stage = prev_stage;
1528 } 1529 }
@@ -1764,14 +1765,9 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
1764static int nilfs_segctor_write(struct nilfs_sc_info *sci, 1765static int nilfs_segctor_write(struct nilfs_sc_info *sci,
1765 struct the_nilfs *nilfs) 1766 struct the_nilfs *nilfs)
1766{ 1767{
1767 struct nilfs_segment_buffer *segbuf; 1768 int ret;
1768 int ret = 0;
1769 1769
1770 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { 1770 ret = nilfs_write_logs(&sci->sc_segbufs, nilfs);
1771 ret = nilfs_segbuf_write(segbuf, nilfs);
1772 if (ret)
1773 break;
1774 }
1775 list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs); 1771 list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs);
1776 return ret; 1772 return ret;
1777} 1773}
@@ -1902,8 +1898,7 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
1902 1898
1903 list_splice_tail_init(&sci->sc_write_logs, &logs); 1899 list_splice_tail_init(&sci->sc_write_logs, &logs);
1904 ret = nilfs_wait_on_logs(&logs); 1900 ret = nilfs_wait_on_logs(&logs);
1905 if (ret) 1901 nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret ? : err);
1906 nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret);
1907 1902
1908 list_splice_tail_init(&sci->sc_segbufs, &logs); 1903 list_splice_tail_init(&sci->sc_segbufs, &logs);
1909 nilfs_cancel_segusage(&logs, nilfs->ns_sufile); 1904 nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
@@ -1937,8 +1932,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1937{ 1932{
1938 struct nilfs_segment_buffer *segbuf; 1933 struct nilfs_segment_buffer *segbuf;
1939 struct page *bd_page = NULL, *fs_page = NULL; 1934 struct page *bd_page = NULL, *fs_page = NULL;
1940 struct nilfs_sb_info *sbi = sci->sc_sbi; 1935 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
1941 struct the_nilfs *nilfs = sbi->s_nilfs;
1942 int update_sr = (sci->sc_super_root != NULL); 1936 int update_sr = (sci->sc_super_root != NULL);
1943 1937
1944 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) { 1938 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
@@ -2020,7 +2014,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
2020 if (update_sr) { 2014 if (update_sr) {
2021 nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start, 2015 nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
2022 segbuf->sb_sum.seg_seq, nilfs->ns_cno++); 2016 segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
2023 sbi->s_super->s_dirt = 1; 2017 set_nilfs_sb_dirty(nilfs);
2024 2018
2025 clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); 2019 clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
2026 clear_bit(NILFS_SC_DIRTY, &sci->sc_flags); 2020 clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
@@ -2220,7 +2214,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2220} 2214}
2221 2215
2222/** 2216/**
2223 * nilfs_secgtor_start_timer - set timer of background write 2217 * nilfs_segctor_start_timer - set timer of background write
2224 * @sci: nilfs_sc_info 2218 * @sci: nilfs_sc_info
2225 * 2219 *
2226 * If the timer has already been set, it ignores the new request. 2220 * If the timer has already been set, it ignores the new request.
@@ -2425,43 +2419,43 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
2425 return err; 2419 return err;
2426} 2420}
2427 2421
2428struct nilfs_segctor_req {
2429 int mode;
2430 __u32 seq_accepted;
2431 int sc_err; /* construction failure */
2432 int sb_err; /* super block writeback failure */
2433};
2434
2435#define FLUSH_FILE_BIT (0x1) /* data file only */ 2422#define FLUSH_FILE_BIT (0x1) /* data file only */
2436#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */ 2423#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */
2437 2424
2438static void nilfs_segctor_accept(struct nilfs_sc_info *sci, 2425/**
2439 struct nilfs_segctor_req *req) 2426 * nilfs_segctor_accept - record accepted sequence count of log-write requests
2427 * @sci: segment constructor object
2428 */
2429static void nilfs_segctor_accept(struct nilfs_sc_info *sci)
2440{ 2430{
2441 req->sc_err = req->sb_err = 0;
2442 spin_lock(&sci->sc_state_lock); 2431 spin_lock(&sci->sc_state_lock);
2443 req->seq_accepted = sci->sc_seq_request; 2432 sci->sc_seq_accepted = sci->sc_seq_request;
2444 spin_unlock(&sci->sc_state_lock); 2433 spin_unlock(&sci->sc_state_lock);
2445 2434
2446 if (sci->sc_timer) 2435 if (sci->sc_timer)
2447 del_timer_sync(sci->sc_timer); 2436 del_timer_sync(sci->sc_timer);
2448} 2437}
2449 2438
2450static void nilfs_segctor_notify(struct nilfs_sc_info *sci, 2439/**
2451 struct nilfs_segctor_req *req) 2440 * nilfs_segctor_notify - notify the result of request to caller threads
2441 * @sci: segment constructor object
2442 * @mode: mode of log forming
2443 * @err: error code to be notified
2444 */
2445static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
2452{ 2446{
2453 /* Clear requests (even when the construction failed) */ 2447 /* Clear requests (even when the construction failed) */
2454 spin_lock(&sci->sc_state_lock); 2448 spin_lock(&sci->sc_state_lock);
2455 2449
2456 if (req->mode == SC_LSEG_SR) { 2450 if (mode == SC_LSEG_SR) {
2457 sci->sc_state &= ~NILFS_SEGCTOR_COMMIT; 2451 sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
2458 sci->sc_seq_done = req->seq_accepted; 2452 sci->sc_seq_done = sci->sc_seq_accepted;
2459 nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err); 2453 nilfs_segctor_wakeup(sci, err);
2460 sci->sc_flush_request = 0; 2454 sci->sc_flush_request = 0;
2461 } else { 2455 } else {
2462 if (req->mode == SC_FLUSH_FILE) 2456 if (mode == SC_FLUSH_FILE)
2463 sci->sc_flush_request &= ~FLUSH_FILE_BIT; 2457 sci->sc_flush_request &= ~FLUSH_FILE_BIT;
2464 else if (req->mode == SC_FLUSH_DAT) 2458 else if (mode == SC_FLUSH_DAT)
2465 sci->sc_flush_request &= ~FLUSH_DAT_BIT; 2459 sci->sc_flush_request &= ~FLUSH_DAT_BIT;
2466 2460
2467 /* re-enable timer if checkpoint creation was not done */ 2461 /* re-enable timer if checkpoint creation was not done */
@@ -2472,30 +2466,37 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci,
2472 spin_unlock(&sci->sc_state_lock); 2466 spin_unlock(&sci->sc_state_lock);
2473} 2467}
2474 2468
2475static int nilfs_segctor_construct(struct nilfs_sc_info *sci, 2469/**
2476 struct nilfs_segctor_req *req) 2470 * nilfs_segctor_construct - form logs and write them to disk
2471 * @sci: segment constructor object
2472 * @mode: mode of log forming
2473 */
2474static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
2477{ 2475{
2478 struct nilfs_sb_info *sbi = sci->sc_sbi; 2476 struct nilfs_sb_info *sbi = sci->sc_sbi;
2479 struct the_nilfs *nilfs = sbi->s_nilfs; 2477 struct the_nilfs *nilfs = sbi->s_nilfs;
2480 int err = 0; 2478 int err = 0;
2481 2479
2480 nilfs_segctor_accept(sci);
2481
2482 if (nilfs_discontinued(nilfs)) 2482 if (nilfs_discontinued(nilfs))
2483 req->mode = SC_LSEG_SR; 2483 mode = SC_LSEG_SR;
2484 if (!nilfs_segctor_confirm(sci)) { 2484 if (!nilfs_segctor_confirm(sci))
2485 err = nilfs_segctor_do_construct(sci, req->mode); 2485 err = nilfs_segctor_do_construct(sci, mode);
2486 req->sc_err = err; 2486
2487 }
2488 if (likely(!err)) { 2487 if (likely(!err)) {
2489 if (req->mode != SC_FLUSH_DAT) 2488 if (mode != SC_FLUSH_DAT)
2490 atomic_set(&nilfs->ns_ndirtyblks, 0); 2489 atomic_set(&nilfs->ns_ndirtyblks, 0);
2491 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && 2490 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
2492 nilfs_discontinued(nilfs)) { 2491 nilfs_discontinued(nilfs)) {
2493 down_write(&nilfs->ns_sem); 2492 down_write(&nilfs->ns_sem);
2494 req->sb_err = nilfs_commit_super(sbi, 2493 err = nilfs_commit_super(
2495 nilfs_altsb_need_update(nilfs)); 2494 sbi, nilfs_altsb_need_update(nilfs));
2496 up_write(&nilfs->ns_sem); 2495 up_write(&nilfs->ns_sem);
2497 } 2496 }
2498 } 2497 }
2498
2499 nilfs_segctor_notify(sci, mode, err);
2499 return err; 2500 return err;
2500} 2501}
2501 2502
@@ -2526,7 +2527,6 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2526 struct nilfs_sc_info *sci = NILFS_SC(sbi); 2527 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2527 struct the_nilfs *nilfs = sbi->s_nilfs; 2528 struct the_nilfs *nilfs = sbi->s_nilfs;
2528 struct nilfs_transaction_info ti; 2529 struct nilfs_transaction_info ti;
2529 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2530 int err; 2530 int err;
2531 2531
2532 if (unlikely(!sci)) 2532 if (unlikely(!sci))
@@ -2547,10 +2547,8 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2547 list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes); 2547 list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes);
2548 2548
2549 for (;;) { 2549 for (;;) {
2550 nilfs_segctor_accept(sci, &req); 2550 err = nilfs_segctor_construct(sci, SC_LSEG_SR);
2551 err = nilfs_segctor_construct(sci, &req);
2552 nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes); 2551 nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes);
2553 nilfs_segctor_notify(sci, &req);
2554 2552
2555 if (likely(!err)) 2553 if (likely(!err))
2556 break; 2554 break;
@@ -2560,6 +2558,16 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2560 set_current_state(TASK_INTERRUPTIBLE); 2558 set_current_state(TASK_INTERRUPTIBLE);
2561 schedule_timeout(sci->sc_interval); 2559 schedule_timeout(sci->sc_interval);
2562 } 2560 }
2561 if (nilfs_test_opt(sbi, DISCARD)) {
2562 int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs,
2563 sci->sc_nfreesegs);
2564 if (ret) {
2565 printk(KERN_WARNING
2566 "NILFS warning: error %d on discard request, "
2567 "turning discards off for the device\n", ret);
2568 nilfs_clear_opt(sbi, DISCARD);
2569 }
2570 }
2563 2571
2564 out_unlock: 2572 out_unlock:
2565 sci->sc_freesegs = NULL; 2573 sci->sc_freesegs = NULL;
@@ -2573,13 +2581,9 @@ static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
2573{ 2581{
2574 struct nilfs_sb_info *sbi = sci->sc_sbi; 2582 struct nilfs_sb_info *sbi = sci->sc_sbi;
2575 struct nilfs_transaction_info ti; 2583 struct nilfs_transaction_info ti;
2576 struct nilfs_segctor_req req = { .mode = mode };
2577 2584
2578 nilfs_transaction_lock(sbi, &ti, 0); 2585 nilfs_transaction_lock(sbi, &ti, 0);
2579 2586 nilfs_segctor_construct(sci, mode);
2580 nilfs_segctor_accept(sci, &req);
2581 nilfs_segctor_construct(sci, &req);
2582 nilfs_segctor_notify(sci, &req);
2583 2587
2584 /* 2588 /*
2585 * Unclosed segment should be retried. We do this using sc_timer. 2589 * Unclosed segment should be retried. We do this using sc_timer.
@@ -2635,6 +2639,7 @@ static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci)
2635static int nilfs_segctor_thread(void *arg) 2639static int nilfs_segctor_thread(void *arg)
2636{ 2640{
2637 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; 2641 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
2642 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
2638 struct timer_list timer; 2643 struct timer_list timer;
2639 int timeout = 0; 2644 int timeout = 0;
2640 2645
@@ -2680,7 +2685,6 @@ static int nilfs_segctor_thread(void *arg)
2680 } else { 2685 } else {
2681 DEFINE_WAIT(wait); 2686 DEFINE_WAIT(wait);
2682 int should_sleep = 1; 2687 int should_sleep = 1;
2683 struct the_nilfs *nilfs;
2684 2688
2685 prepare_to_wait(&sci->sc_wait_daemon, &wait, 2689 prepare_to_wait(&sci->sc_wait_daemon, &wait,
2686 TASK_INTERRUPTIBLE); 2690 TASK_INTERRUPTIBLE);
@@ -2701,8 +2705,8 @@ static int nilfs_segctor_thread(void *arg)
2701 finish_wait(&sci->sc_wait_daemon, &wait); 2705 finish_wait(&sci->sc_wait_daemon, &wait);
2702 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && 2706 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2703 time_after_eq(jiffies, sci->sc_timer->expires)); 2707 time_after_eq(jiffies, sci->sc_timer->expires));
2704 nilfs = sci->sc_sbi->s_nilfs; 2708
2705 if (sci->sc_super->s_dirt && nilfs_sb_need_update(nilfs)) 2709 if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs))
2706 set_nilfs_discontinued(nilfs); 2710 set_nilfs_discontinued(nilfs);
2707 } 2711 }
2708 goto loop; 2712 goto loop;
@@ -2797,12 +2801,9 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
2797 do { 2801 do {
2798 struct nilfs_sb_info *sbi = sci->sc_sbi; 2802 struct nilfs_sb_info *sbi = sci->sc_sbi;
2799 struct nilfs_transaction_info ti; 2803 struct nilfs_transaction_info ti;
2800 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2801 2804
2802 nilfs_transaction_lock(sbi, &ti, 0); 2805 nilfs_transaction_lock(sbi, &ti, 0);
2803 nilfs_segctor_accept(sci, &req); 2806 ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
2804 ret = nilfs_segctor_construct(sci, &req);
2805 nilfs_segctor_notify(sci, &req);
2806 nilfs_transaction_unlock(sbi); 2807 nilfs_transaction_unlock(sbi);
2807 2808
2808 } while (ret && retrycount-- > 0); 2809 } while (ret && retrycount-- > 0);
@@ -2829,7 +2830,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2829 || sci->sc_seq_request != sci->sc_seq_done); 2830 || sci->sc_seq_request != sci->sc_seq_done);
2830 spin_unlock(&sci->sc_state_lock); 2831 spin_unlock(&sci->sc_state_lock);
2831 2832
2832 if (flag || nilfs_segctor_confirm(sci)) 2833 if (flag || !nilfs_segctor_confirm(sci))
2833 nilfs_segctor_write_out(sci); 2834 nilfs_segctor_write_out(sci);
2834 2835
2835 WARN_ON(!list_empty(&sci->sc_copied_buffers)); 2836 WARN_ON(!list_empty(&sci->sc_copied_buffers));
@@ -2853,7 +2854,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2853 * @sbi: nilfs_sb_info 2854 * @sbi: nilfs_sb_info
2854 * 2855 *
2855 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info, 2856 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
2856 * initilizes it, and starts the segment constructor. 2857 * initializes it, and starts the segment constructor.
2857 * 2858 *
2858 * Return Value: On success, 0 is returned. On error, one of the following 2859 * Return Value: On success, 0 is returned. On error, one of the following
2859 * negative error code is returned. 2860 * negative error code is returned.
@@ -2865,8 +2866,15 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
2865 struct the_nilfs *nilfs = sbi->s_nilfs; 2866 struct the_nilfs *nilfs = sbi->s_nilfs;
2866 int err; 2867 int err;
2867 2868
2868 /* Each field of nilfs_segctor is cleared through the initialization 2869 if (NILFS_SC(sbi)) {
2869 of super-block info */ 2870 /*
2871 * This happens if the filesystem was remounted
2872 * read/write after nilfs_error degenerated it into a
2873 * read-only mount.
2874 */
2875 nilfs_detach_segment_constructor(sbi);
2876 }
2877
2870 sbi->s_sc_info = nilfs_segctor_new(sbi); 2878 sbi->s_sc_info = nilfs_segctor_new(sbi);
2871 if (!sbi->s_sc_info) 2879 if (!sbi->s_sc_info)
2872 return -ENOMEM; 2880 return -ENOMEM;
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 3d3ab2f9864c..82dfd6a686b9 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -30,7 +30,7 @@
30#include "sb.h" 30#include "sb.h"
31 31
32/** 32/**
33 * struct nilfs_recovery_info - Recovery infomation 33 * struct nilfs_recovery_info - Recovery information
34 * @ri_need_recovery: Recovery status 34 * @ri_need_recovery: Recovery status
35 * @ri_super_root: Block number of the last super root 35 * @ri_super_root: Block number of the last super root
36 * @ri_ri_cno: Number of the last checkpoint 36 * @ri_ri_cno: Number of the last checkpoint
@@ -71,7 +71,7 @@ struct nilfs_recovery_info {
71 */ 71 */
72struct nilfs_cstage { 72struct nilfs_cstage {
73 int scnt; 73 int scnt;
74 unsigned flags; 74 unsigned flags;
75 struct nilfs_inode_info *dirty_file_ptr; 75 struct nilfs_inode_info *dirty_file_ptr;
76 struct nilfs_inode_info *gc_inode_ptr; 76 struct nilfs_inode_info *gc_inode_ptr;
77}; 77};
@@ -116,6 +116,7 @@ struct nilfs_segsum_pointer {
116 * @sc_wait_daemon: Daemon wait queue 116 * @sc_wait_daemon: Daemon wait queue
117 * @sc_wait_task: Start/end wait queue to control segctord task 117 * @sc_wait_task: Start/end wait queue to control segctord task
118 * @sc_seq_request: Request counter 118 * @sc_seq_request: Request counter
119 * @sc_seq_accept: Accepted request count
119 * @sc_seq_done: Completion counter 120 * @sc_seq_done: Completion counter
120 * @sc_sync: Request of explicit sync operation 121 * @sc_sync: Request of explicit sync operation
121 * @sc_interval: Timeout value of background construction 122 * @sc_interval: Timeout value of background construction
@@ -169,6 +170,7 @@ struct nilfs_sc_info {
169 wait_queue_head_t sc_wait_task; 170 wait_queue_head_t sc_wait_task;
170 171
171 __u32 sc_seq_request; 172 __u32 sc_seq_request;
173 __u32 sc_seq_accepted;
172 __u32 sc_seq_done; 174 __u32 sc_seq_done;
173 175
174 int sc_sync; 176 int sc_sync;
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index b6c36d0cc331..3c6cc6005c2e 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -18,7 +18,7 @@
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 * 19 *
20 * Written by Koji Sato <koji@osrg.net>. 20 * Written by Koji Sato <koji@osrg.net>.
21 * Rivised by Ryusuke Konishi <ryusuke@osrg.net>. 21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
22 */ 22 */
23 23
24#include <linux/kernel.h> 24#include <linux/kernel.h>
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 8173faee31e6..0cdbc5e7655a 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -96,9 +96,6 @@ void nilfs_error(struct super_block *sb, const char *function,
96 if (!(sb->s_flags & MS_RDONLY)) { 96 if (!(sb->s_flags & MS_RDONLY)) {
97 struct the_nilfs *nilfs = sbi->s_nilfs; 97 struct the_nilfs *nilfs = sbi->s_nilfs;
98 98
99 if (!nilfs_test_opt(sbi, ERRORS_CONT))
100 nilfs_detach_segment_constructor(sbi);
101
102 down_write(&nilfs->ns_sem); 99 down_write(&nilfs->ns_sem);
103 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { 100 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
104 nilfs->ns_mount_state |= NILFS_ERROR_FS; 101 nilfs->ns_mount_state |= NILFS_ERROR_FS;
@@ -301,7 +298,7 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
301 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); 298 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
302 nilfs->ns_sbwtime[1] = t; 299 nilfs->ns_sbwtime[1] = t;
303 } 300 }
304 sbi->s_super->s_dirt = 0; 301 clear_nilfs_sb_dirty(nilfs);
305 return nilfs_sync_super(sbi, dupsb); 302 return nilfs_sync_super(sbi, dupsb);
306} 303}
307 304
@@ -345,7 +342,7 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
345 err = nilfs_construct_segment(sb); 342 err = nilfs_construct_segment(sb);
346 343
347 down_write(&nilfs->ns_sem); 344 down_write(&nilfs->ns_sem);
348 if (sb->s_dirt) 345 if (nilfs_sb_dirty(nilfs))
349 nilfs_commit_super(sbi, 1); 346 nilfs_commit_super(sbi, 1);
350 up_write(&nilfs->ns_sem); 347 up_write(&nilfs->ns_sem);
351 348
@@ -439,7 +436,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
439 /* 436 /*
440 * Compute the overhead 437 * Compute the overhead
441 * 438 *
442 * When distributing meta data blocks outside semgent structure, 439 * When distributing meta data blocks outside segment structure,
443 * We must count them as the overhead. 440 * We must count them as the overhead.
444 */ 441 */
445 overhead = 0; 442 overhead = 0;
@@ -481,6 +478,8 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
481 seq_printf(seq, ",order=strict"); 478 seq_printf(seq, ",order=strict");
482 if (nilfs_test_opt(sbi, NORECOVERY)) 479 if (nilfs_test_opt(sbi, NORECOVERY))
483 seq_printf(seq, ",norecovery"); 480 seq_printf(seq, ",norecovery");
481 if (nilfs_test_opt(sbi, DISCARD))
482 seq_printf(seq, ",discard");
484 483
485 return 0; 484 return 0;
486} 485}
@@ -550,7 +549,7 @@ static const struct export_operations nilfs_export_ops = {
550enum { 549enum {
551 Opt_err_cont, Opt_err_panic, Opt_err_ro, 550 Opt_err_cont, Opt_err_panic, Opt_err_ro,
552 Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, 551 Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
553 Opt_err, 552 Opt_discard, Opt_err,
554}; 553};
555 554
556static match_table_t tokens = { 555static match_table_t tokens = {
@@ -561,6 +560,7 @@ static match_table_t tokens = {
561 {Opt_snapshot, "cp=%u"}, 560 {Opt_snapshot, "cp=%u"},
562 {Opt_order, "order=%s"}, 561 {Opt_order, "order=%s"},
563 {Opt_norecovery, "norecovery"}, 562 {Opt_norecovery, "norecovery"},
563 {Opt_discard, "discard"},
564 {Opt_err, NULL} 564 {Opt_err, NULL}
565}; 565};
566 566
@@ -614,6 +614,9 @@ static int parse_options(char *options, struct super_block *sb)
614 case Opt_norecovery: 614 case Opt_norecovery:
615 nilfs_set_opt(sbi, NORECOVERY); 615 nilfs_set_opt(sbi, NORECOVERY);
616 break; 616 break;
617 case Opt_discard:
618 nilfs_set_opt(sbi, DISCARD);
619 break;
617 default: 620 default:
618 printk(KERN_ERR 621 printk(KERN_ERR
619 "NILFS: Unrecognized mount option \"%s\"\n", p); 622 "NILFS: Unrecognized mount option \"%s\"\n", p);
@@ -863,7 +866,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
863 if ((*flags & MS_RDONLY) && 866 if ((*flags & MS_RDONLY) &&
864 sbi->s_snapshot_cno != old_opts.snapshot_cno) { 867 sbi->s_snapshot_cno != old_opts.snapshot_cno) {
865 printk(KERN_WARNING "NILFS (device %s): couldn't " 868 printk(KERN_WARNING "NILFS (device %s): couldn't "
866 "remount to a different snapshot. \n", 869 "remount to a different snapshot.\n",
867 sb->s_id); 870 sb->s_id);
868 err = -EINVAL; 871 err = -EINVAL;
869 goto restore_opts; 872 goto restore_opts;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 6241e1722efc..33871f7e4f01 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -386,7 +386,7 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
386 386
387 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); 387 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
388 if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) { 388 if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
389 printk(KERN_ERR "NILFS: too short segment. \n"); 389 printk(KERN_ERR "NILFS: too short segment.\n");
390 return -EINVAL; 390 return -EINVAL;
391 } 391 }
392 392
@@ -646,6 +646,44 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
646 goto out; 646 goto out;
647} 647}
648 648
649int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
650 size_t nsegs)
651{
652 sector_t seg_start, seg_end;
653 sector_t start = 0, nblocks = 0;
654 unsigned int sects_per_block;
655 __u64 *sn;
656 int ret = 0;
657
658 sects_per_block = (1 << nilfs->ns_blocksize_bits) /
659 bdev_logical_block_size(nilfs->ns_bdev);
660 for (sn = segnump; sn < segnump + nsegs; sn++) {
661 nilfs_get_segment_range(nilfs, *sn, &seg_start, &seg_end);
662
663 if (!nblocks) {
664 start = seg_start;
665 nblocks = seg_end - seg_start + 1;
666 } else if (start + nblocks == seg_start) {
667 nblocks += seg_end - seg_start + 1;
668 } else {
669 ret = blkdev_issue_discard(nilfs->ns_bdev,
670 start * sects_per_block,
671 nblocks * sects_per_block,
672 GFP_NOFS,
673 DISCARD_FL_BARRIER);
674 if (ret < 0)
675 return ret;
676 nblocks = 0;
677 }
678 }
679 if (nblocks)
680 ret = blkdev_issue_discard(nilfs->ns_bdev,
681 start * sects_per_block,
682 nblocks * sects_per_block,
683 GFP_NOFS, DISCARD_FL_BARRIER);
684 return ret;
685}
686
649int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) 687int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
650{ 688{
651 struct inode *dat = nilfs_dat_inode(nilfs); 689 struct inode *dat = nilfs_dat_inode(nilfs);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 589786e33464..1ab974533697 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -29,6 +29,7 @@
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/backing-dev.h> 31#include <linux/backing-dev.h>
32#include <linux/slab.h>
32#include "sb.h" 33#include "sb.h"
33 34
34/* the_nilfs struct */ 35/* the_nilfs struct */
@@ -38,6 +39,7 @@ enum {
38 the latest checkpoint was loaded */ 39 the latest checkpoint was loaded */
39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ 40 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
40 THE_NILFS_GC_RUNNING, /* gc process is running */ 41 THE_NILFS_GC_RUNNING, /* gc process is running */
42 THE_NILFS_SB_DIRTY, /* super block is dirty */
41}; 43};
42 44
43/** 45/**
@@ -197,6 +199,7 @@ THE_NILFS_FNS(INIT, init)
197THE_NILFS_FNS(LOADED, loaded) 199THE_NILFS_FNS(LOADED, loaded)
198THE_NILFS_FNS(DISCONTINUED, discontinued) 200THE_NILFS_FNS(DISCONTINUED, discontinued)
199THE_NILFS_FNS(GC_RUNNING, gc_running) 201THE_NILFS_FNS(GC_RUNNING, gc_running)
202THE_NILFS_FNS(SB_DIRTY, sb_dirty)
200 203
201/* Minimum interval of periodical update of superblocks (in seconds) */ 204/* Minimum interval of periodical update of superblocks (in seconds) */
202#define NILFS_SB_FREQ 10 205#define NILFS_SB_FREQ 10
@@ -221,6 +224,7 @@ struct the_nilfs *find_or_create_nilfs(struct block_device *);
221void put_nilfs(struct the_nilfs *); 224void put_nilfs(struct the_nilfs *);
222int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); 225int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
223int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); 226int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
227int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
224int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); 228int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
225struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64); 229struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
226int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int); 230int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 037e878e03fc..fcc2f064af83 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/dcache.h> 19#include <linux/dcache.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/gfp.h>
21#include <linux/init.h> 22#include <linux/init.h>
22#include <linux/module.h> 23#include <linux/module.h>
23#include <linux/srcu.h> 24#include <linux/srcu.h>
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 3165d85aada2..0399bcbe09c8 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -87,7 +87,6 @@
87#include <linux/kernel.h> 87#include <linux/kernel.h>
88#include <linux/module.h> 88#include <linux/module.h>
89#include <linux/mutex.h> 89#include <linux/mutex.h>
90#include <linux/slab.h>
91#include <linux/spinlock.h> 90#include <linux/spinlock.h>
92#include <linux/writeback.h> /* for inode_lock */ 91#include <linux/writeback.h> /* for inode_lock */
93 92
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index c9ee67b442e1..1afb0a10229f 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -121,7 +121,7 @@ static int idr_callback(int id, void *p, void *data)
121 if (warned) 121 if (warned)
122 return 0; 122 return 0;
123 123
124 warned = false; 124 warned = true;
125 entry = p; 125 entry = p;
126 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 126 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
127 127
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 8271cf05c957..472cdf29ef82 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -29,14 +29,12 @@
29#include <linux/init.h> /* module_init */ 29#include <linux/init.h> /* module_init */
30#include <linux/inotify.h> 30#include <linux/inotify.h>
31#include <linux/kernel.h> /* roundup() */ 31#include <linux/kernel.h> /* roundup() */
32#include <linux/magic.h> /* superblock magic number */
33#include <linux/mount.h> /* mntget */
34#include <linux/namei.h> /* LOOKUP_FOLLOW */ 32#include <linux/namei.h> /* LOOKUP_FOLLOW */
35#include <linux/path.h> /* struct path */
36#include <linux/sched.h> /* struct user */ 33#include <linux/sched.h> /* struct user */
37#include <linux/slab.h> /* struct kmem_cache */ 34#include <linux/slab.h> /* struct kmem_cache */
38#include <linux/syscalls.h> 35#include <linux/syscalls.h>
39#include <linux/types.h> 36#include <linux/types.h>
37#include <linux/anon_inodes.h>
40#include <linux/uaccess.h> 38#include <linux/uaccess.h>
41#include <linux/poll.h> 39#include <linux/poll.h>
42#include <linux/wait.h> 40#include <linux/wait.h>
@@ -45,8 +43,6 @@
45 43
46#include <asm/ioctls.h> 44#include <asm/ioctls.h>
47 45
48static struct vfsmount *inotify_mnt __read_mostly;
49
50/* these are configurable via /proc/sys/fs/inotify/ */ 46/* these are configurable via /proc/sys/fs/inotify/ */
51static int inotify_max_user_instances __read_mostly; 47static int inotify_max_user_instances __read_mostly;
52static int inotify_max_queued_events __read_mostly; 48static int inotify_max_queued_events __read_mostly;
@@ -552,7 +548,7 @@ retry:
552 548
553 spin_lock(&group->inotify_data.idr_lock); 549 spin_lock(&group->inotify_data.idr_lock);
554 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, 550 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
555 group->inotify_data.last_wd, 551 group->inotify_data.last_wd+1,
556 &tmp_ientry->wd); 552 &tmp_ientry->wd);
557 spin_unlock(&group->inotify_data.idr_lock); 553 spin_unlock(&group->inotify_data.idr_lock);
558 if (ret) { 554 if (ret) {
@@ -632,7 +628,7 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign
632 628
633 spin_lock_init(&group->inotify_data.idr_lock); 629 spin_lock_init(&group->inotify_data.idr_lock);
634 idr_init(&group->inotify_data.idr); 630 idr_init(&group->inotify_data.idr);
635 group->inotify_data.last_wd = 1; 631 group->inotify_data.last_wd = 0;
636 group->inotify_data.user = user; 632 group->inotify_data.user = user;
637 group->inotify_data.fa = NULL; 633 group->inotify_data.fa = NULL;
638 634
@@ -645,9 +641,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
645{ 641{
646 struct fsnotify_group *group; 642 struct fsnotify_group *group;
647 struct user_struct *user; 643 struct user_struct *user;
648 struct file *filp; 644 int ret;
649 struct path path;
650 int fd, ret;
651 645
652 /* Check the IN_* constants for consistency. */ 646 /* Check the IN_* constants for consistency. */
653 BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC); 647 BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC);
@@ -656,10 +650,6 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
656 if (flags & ~(IN_CLOEXEC | IN_NONBLOCK)) 650 if (flags & ~(IN_CLOEXEC | IN_NONBLOCK))
657 return -EINVAL; 651 return -EINVAL;
658 652
659 fd = get_unused_fd_flags(flags & O_CLOEXEC);
660 if (fd < 0)
661 return fd;
662
663 user = get_current_user(); 653 user = get_current_user();
664 if (unlikely(atomic_read(&user->inotify_devs) >= 654 if (unlikely(atomic_read(&user->inotify_devs) >=
665 inotify_max_user_instances)) { 655 inotify_max_user_instances)) {
@@ -676,27 +666,14 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
676 666
677 atomic_inc(&user->inotify_devs); 667 atomic_inc(&user->inotify_devs);
678 668
679 path.mnt = inotify_mnt; 669 ret = anon_inode_getfd("inotify", &inotify_fops, group,
680 path.dentry = inotify_mnt->mnt_root; 670 O_RDONLY | flags);
681 path_get(&path); 671 if (ret >= 0)
682 filp = alloc_file(&path, FMODE_READ, &inotify_fops); 672 return ret;
683 if (!filp)
684 goto Enfile;
685 673
686 filp->f_flags = O_RDONLY | (flags & O_NONBLOCK);
687 filp->private_data = group;
688
689 fd_install(fd, filp);
690
691 return fd;
692
693Enfile:
694 ret = -ENFILE;
695 path_put(&path);
696 atomic_dec(&user->inotify_devs); 674 atomic_dec(&user->inotify_devs);
697out_free_uid: 675out_free_uid:
698 free_uid(user); 676 free_uid(user);
699 put_unused_fd(fd);
700 return ret; 677 return ret;
701} 678}
702 679
@@ -783,20 +760,6 @@ out:
783 return ret; 760 return ret;
784} 761}
785 762
786static int
787inotify_get_sb(struct file_system_type *fs_type, int flags,
788 const char *dev_name, void *data, struct vfsmount *mnt)
789{
790 return get_sb_pseudo(fs_type, "inotify", NULL,
791 INOTIFYFS_SUPER_MAGIC, mnt);
792}
793
794static struct file_system_type inotify_fs_type = {
795 .name = "inotifyfs",
796 .get_sb = inotify_get_sb,
797 .kill_sb = kill_anon_super,
798};
799
800/* 763/*
801 * inotify_user_setup - Our initialization function. Note that we cannnot return 764 * inotify_user_setup - Our initialization function. Note that we cannnot return
802 * error because we have compiled-in VFS hooks. So an (unlikely) failure here 765 * error because we have compiled-in VFS hooks. So an (unlikely) failure here
@@ -804,16 +767,6 @@ static struct file_system_type inotify_fs_type = {
804 */ 767 */
805static int __init inotify_user_setup(void) 768static int __init inotify_user_setup(void)
806{ 769{
807 int ret;
808
809 ret = register_filesystem(&inotify_fs_type);
810 if (unlikely(ret))
811 panic("inotify: register_filesystem returned %d!\n", ret);
812
813 inotify_mnt = kern_mount(&inotify_fs_type);
814 if (IS_ERR(inotify_mnt))
815 panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
816
817 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); 770 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC);
818 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); 771 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
819 772
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
deleted file mode 100644
index 37c11e194372..000000000000
--- a/fs/ntfs/ChangeLog
+++ /dev/null
@@ -1,1702 +0,0 @@
1ToDo/Notes:
2 - Find and fix bugs.
3 - The only places in the kernel where a file is resized are
4 ntfs_file_write*() and ntfs_truncate() for both of which i_mutex is
5 held. Just have to be careful in read-/writepage and other helpers
6 not running under i_mutex that we play nice. Also need to be careful
7 with initialized_size extension in ntfs_file_write*() and writepage.
8 UPDATE: The only things that need to be checked are the compressed
9 write and the other attribute resize/write cases like index
10 attributes, etc. For now none of these are implemented so are safe.
11 - Implement filling in of holes in aops.c::ntfs_writepage() and its
12 helpers.
13 - Implement mft.c::sync_mft_mirror_umount(). We currently will just
14 leave the volume dirty on umount if the final iput(vol->mft_ino)
15 causes a write of any mirrored mft records due to the mft mirror
16 inode having been discarded already. Whether this can actually ever
17 happen is unclear however so it is worth waiting until someone hits
18 the problem.
19
202.1.29 - Fix a deadlock at mount time.
21
22 - During mount the VFS holds s_umount lock on the superblock. So when
23 we try to empty the journal $LogFile contents by calling
24 ntfs_attr_set() when the machine does not have much memory and the
25 journal is large ntfs_attr_set() results in the VM trying to balance
26 dirty pages which in turn tries to take the s_umount lock and thus we
27 get a deadlock. The solution is to not use ntfs_attr_set() and
28 instead do the zeroing by hand at the block level rather than page
29 cache level.
30 - Fix sparse warnings.
31
322.1.28 - Fix a deadlock.
33
34 - Fix deadlock in fs/ntfs/inode.c::ntfs_put_inode(). Thanks to Sergey
35 Vlasov for the report and detailed analysis of the deadlock. The fix
36 involved getting rid of ntfs_put_inode() altogether and hence NTFS no
37 longer has a ->put_inode super operation.
38
392.1.27 - Various bug fixes and cleanups.
40
41 - Fix two compiler warnings on Alpha. Thanks to Andrew Morton for
42 reporting them.
43 - Fix an (innocent) off-by-one error in the runlist code.
44 - Fix a buggette in a "should be impossible" case handling where we
45 continued the attribute lookup loop instead of aborting it.
46 - Use buffer_migrate_page() for the ->migratepage function of all ntfs
47 address space operations.
48 - Fix comparison of $MFT and $MFTMirr to not bail out when there are
49 unused, invalid mft records which are the same in both $MFT and
50 $MFTMirr.
51 - Add support for sparse files which have a compression unit of 0.
52 - Remove all the make_bad_inode() calls. This should only be called
53 from read inode and new inode code paths.
54 - Limit name length in fs/ntfs/unistr.c::ntfs_nlstoucs() to maximum
55 allowed by NTFS, i.e. 255 Unicode characters, not including the
56 terminating NULL (which is not stored on disk).
57 - Improve comments on file attribute flags in fs/ntfs/layout.h.
58 - Fix a bug in fs/ntfs/inode.c::ntfs_read_locked_index_inode() where we
59 forgot to update a temporary variable so loading index inodes which
60 have an index allocation attribute failed.
61 - Add a missing call to flush_dcache_mft_record_page() in
62 fs/ntfs/inode.c::ntfs_write_inode().
63 - Handle the recently introduced -ENAMETOOLONG return value from
64 fs/ntfs/unistr.c::ntfs_nlstoucs() in fs/ntfs/namei.c::ntfs_lookup().
65 - Semaphore to mutex conversion. (Ingo Molnar)
66
672.1.26 - Minor bug fixes and updates.
68
69 - Fix a potential overflow in file.c where a cast to s64 was missing in
70 a left shift of a page index.
71 - The struct inode has had its i_sem semaphore changed to a mutex named
72 i_mutex.
73 - We have struct kmem_cache now so use it instead of the typedef
74 kmem_cache_t. (Pekka Enberg)
75 - Implement support for sector sizes above 512 bytes (up to the maximum
76 supported by NTFS which is 4096 bytes).
77 - Do more detailed reporting of why we cannot mount read-write by
78 special casing the VOLUME_MODIFIED_BY_CHKDSK flag.
79 - Miscellaneous updates to layout.h.
80 - Cope with attribute list attribute having invalid flags. Windows
81 copes with this and even chkdsk does not detect or fix this so we
82 have to cope with it, too. Thanks to Pawel Kot for reporting the
83 problem.
84
852.1.25 - (Almost) fully implement write(2) and truncate(2).
86
87 - Change ntfs_map_runlist_nolock(), ntfs_attr_find_vcn_nolock() and
88 {__,}ntfs_cluster_free() to also take an optional attribute search
89 context as argument. This allows calling these functions with the
90 mft record mapped. Update all callers.
91 - Fix potential deadlock in ntfs_mft_data_extend_allocation_nolock()
92 error handling by passing in the active search context when calling
93 ntfs_cluster_free().
94 - Change ntfs_cluster_alloc() to take an extra boolean parameter
95 specifying whether the clusters are being allocated to extend an
96 attribute or to fill a hole.
97 - Change ntfs_attr_make_non_resident() to call ntfs_cluster_alloc()
98 with @is_extension set to TRUE and remove the runlist terminator
99 fixup code as this is now done by ntfs_cluster_alloc().
100 - Change ntfs_attr_make_non_resident to take the attribute value size
101 as an extra parameter. This is needed since we need to know the size
102 before we can map the mft record and our callers always know it. The
103 reason we cannot simply read the size from the vfs inode i_size is
104 that this is not necessarily uptodate. This happens when
105 ntfs_attr_make_non_resident() is called in the ->truncate call path.
106 - Fix ntfs_attr_make_non_resident() to update the vfs inode i_blocks
107 which is zero for a resident attribute but should no longer be zero
108 once the attribute is non-resident as it then has real clusters
109 allocated.
110 - Add fs/ntfs/attrib.[hc]::ntfs_attr_extend_allocation(), a function to
111 extend the allocation of an attribute. Optionally, the data size,
112 but not the initialized size can be extended, too.
113 - Implement fs/ntfs/inode.[hc]::ntfs_truncate(). It only supports
114 uncompressed and unencrypted files and it never creates sparse files
115 at least for the moment (making a file sparse requires us to modify
116 its directory entries and we do not support directory operations at
117 the moment). Also, support for highly fragmented files, i.e. ones
118 whose data attribute is split across multiple extents, is severely
119 limited. When such a case is encountered, EOPNOTSUPP is returned.
120 - Enable ATTR_SIZE attribute changes in ntfs_setattr(). This completes
121 the initial implementation of file truncation. Now both open(2)ing
122 a file with the O_TRUNC flag and the {,f}truncate(2) system calls
123 will resize a file appropriately. The limitations are that only
124 uncompressed and unencrypted files are supported. Also, there is
125 only very limited support for highly fragmented files (the ones whose
126 $DATA attribute is split into multiple attribute extents).
127 - In attrib.c::ntfs_attr_set() call balance_dirty_pages_ratelimited()
128 and cond_resched() in the main loop as we could be dirtying a lot of
129 pages and this ensures we play nice with the VM and the system as a
130 whole.
131 - Implement file operations ->write, ->aio_write, ->writev for regular
132 files. This replaces the old use of generic_file_write(), et al and
133 the address space operations ->prepare_write and ->commit_write.
134 This means that both sparse and non-sparse (unencrypted and
135 uncompressed) files can now be extended using the normal write(2)
136 code path. There are two limitations at present and these are that
137 we never create sparse files and that we only have limited support
138 for highly fragmented files, i.e. ones whose data attribute is split
139 across multiple extents. When such a case is encountered,
140 EOPNOTSUPP is returned.
141 - $EA attributes can be both resident and non-resident.
142 - Use %z for size_t to fix compilation warnings. (Andrew Morton)
143 - Fix compilation warnings with gcc-4.0.2 on SUSE 10.0.
144 - Document extended attribute ($EA) NEED_EA flag. (Based on libntfs
145 patch by Yura Pakhuchiy.)
146
1472.1.24 - Lots of bug fixes and support more clean journal states.
148
149 - Support journals ($LogFile) which have been modified by chkdsk. This
150 means users can boot into Windows after we marked the volume dirty.
151 The Windows boot will run chkdsk and then reboot. The user can then
152 immediately boot into Linux rather than having to do a full Windows
153 boot first before rebooting into Linux and we will recognize such a
154 journal and empty it as it is clean by definition. Note, this only
155 works if chkdsk left the journal in an obviously clean state.
156 - Support journals ($LogFile) with only one restart page as well as
157 journals with two different restart pages. We sanity check both and
158 either use the only sane one or the more recent one of the two in the
159 case that both are valid.
160 - Add fs/ntfs/malloc.h::ntfs_malloc_nofs_nofail() which is analogous to
161 ntfs_malloc_nofs() but it performs allocations with __GFP_NOFAIL and
162 hence cannot fail.
163 - Use ntfs_malloc_nofs_nofail() in the two critical regions in
164 fs/ntfs/runlist.c::ntfs_runlists_merge(). This means we no longer
165 need to panic() if the allocation fails as it now cannot fail.
166 - Fix two nasty runlist merging bugs that had gone unnoticed so far.
167 Thanks to Stefano Picerno for the bug report.
168 - Remove two bogus BUG_ON()s from fs/ntfs/mft.c.
169 - Fix handling of valid but empty mapping pairs array in
170 fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress().
171 - Report unrepresentable inodes during ntfs_readdir() as KERN_WARNING
172 messages and include the inode number. Thanks to Yura Pakhuchiy for
173 pointing this out.
174 - Change ntfs_rl_truncate_nolock() to throw away the runlist if the new
175 length is zero.
176 - Add runlist.[hc]::ntfs_rl_punch_nolock() which punches a caller
177 specified hole into a runlist.
178 - Fix a bug in fs/ntfs/index.c::ntfs_index_lookup(). When the returned
179 index entry is in the index root, we forgot to set the @ir pointer in
180 the index context. Thanks to Yura Pakhuchiy for finding this bug.
181 - Remove bogus setting of PageError in ntfs_read_compressed_block().
182 - Add fs/ntfs/attrib.[hc]::ntfs_resident_attr_value_resize().
183 - Fix a bug in ntfs_map_runlist_nolock() where we forgot to protect
184 access to the allocated size in the ntfs inode with the size lock.
185 - Fix ntfs_attr_vcn_to_lcn_nolock() and ntfs_attr_find_vcn_nolock() to
186 return LCN_ENOENT when there is no runlist and the allocated size is
187 zero.
188 - Fix load_attribute_list() to handle the case of a NULL runlist.
189 - Fix handling of sparse attributes in ntfs_attr_make_non_resident().
190 - Add BUG() checks to ntfs_attr_make_non_resident() and ntfs_attr_set()
191 to ensure that these functions are never called for compressed or
192 encrypted attributes.
193 - Fix cluster (de)allocators to work when the runlist is NULL and more
194 importantly to take a locked runlist rather than them locking it
195 which leads to lock reversal.
196 - Truncate {a,c,m}time to the ntfs supported time granularity when
197 updating the times in the inode in ntfs_setattr().
198 - Fixup handling of sparse, compressed, and encrypted attributes in
199 fs/ntfs/inode.c::ntfs_read_locked_{,attr_,index_}inode(),
200 fs/ntfs/aops.c::ntfs_{read,write}page().
201 - Make ntfs_write_block() not instantiate sparse blocks if they contain
202 only zeroes.
203 - Optimize fs/ntfs/aops.c::ntfs_write_block() by extending the page
204 lock protection over the buffer submission for i/o which allows the
205 removal of the get_bh()/put_bh() pairs for each buffer.
206 - Fix fs/ntfs/aops.c::ntfs_{read,write}_block() to handle the case
207 where a concurrent truncate has truncated the runlist under our feet.
208 - Fix page_has_buffers()/page_buffers() handling in fs/ntfs/aops.c.
209 - In fs/ntfs/aops.c::ntfs_end_buffer_async_read(), use a bit spin lock
210 in the first buffer head instead of a driver global spin lock to
211 improve scalability.
212 - Minor fix to error handling and error message display in
213 fs/ntfs/aops.c::ntfs_prepare_nonresident_write().
214 - Change the mount options {u,f,d}mask to always parse the number as
215 an octal number to conform to how chmod(1) works, too. Thanks to
216 Giuseppe Bilotta and Horst von Brand for pointing out the errors of
217 my ways.
218 - Fix various bugs in the runlist merging code. (Based on libntfs
219 changes by Richard Russon.)
220 - Fix sparse warnings that have crept in over time.
221 - Change ntfs_cluster_free() to require a write locked runlist on entry
222 since we otherwise get into a lock reversal deadlock if a read locked
223 runlist is passed in. In the process also change it to take an ntfs
224 inode instead of a vfs inode as parameter.
225 - Fix the definition of the CHKD ntfs record magic. It had an off by
226 two error causing it to be CHKB instead of CHKD.
227 - Fix a stupid bug in __ntfs_bitmap_set_bits_in_run() which caused the
228 count to become negative and hence we had a wild memset() scribbling
229 all over the system's ram.
230
2312.1.23 - Implement extension of resident files and make writing safe as well as
232 many bug fixes, cleanups, and enhancements...
233
234 - Add printk rate limiting for ntfs_warning() and ntfs_error() when
235 compiled without debug. This avoids a possible denial of service
236 attack. Thanks to Carl-Daniel Hailfinger from SuSE for pointing this
237 out.
238 - Fix compilation warnings on ia64. (Randy Dunlap)
239 - Use i_size_{read,write}() instead of reading i_size by hand and cache
240 the value where appropriate.
241 - Add size_lock to the ntfs_inode structure. This is an rw spinlock
242 and it locks against access to the inode sizes. Note, ->size_lock
243 is also accessed from irq context so you must use the _irqsave and
244 _irqrestore lock and unlock functions, respectively. Protect all
245 accesses to allocated_size, initialized_size, and compressed_size.
246 - Minor optimization to fs/ntfs/super.c::ntfs_statfs() and its helpers.
247 - Implement extension of resident files in the regular file write code
248 paths (fs/ntfs/aops.c::ntfs_{prepare,commit}_write()). At present
249 this only works until the data attribute becomes too big for the mft
250 record after which we abort the write returning -EOPNOTSUPP from
251 ntfs_prepare_write().
252 - Add disable_sparse mount option together with a per volume sparse
253 enable bit which is set appropriately and a per inode sparse disable
254 bit which is preset on some system file inodes as appropriate.
255 - Enforce that sparse support is disabled on NTFS volumes pre 3.0.
256 - Fix a bug in fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress() in
257 the creation of the unmapped runlist element for the base attribute
258 extent.
259 - Split ntfs_map_runlist() into ntfs_map_runlist() and a non-locking
260 helper ntfs_map_runlist_nolock() which is used by ntfs_map_runlist().
261 This allows us to map runlist fragments with the runlist lock already
262 held without having to drop and reacquire it around the call. Adapt
263 all callers.
264 - Change ntfs_find_vcn() to ntfs_find_vcn_nolock() which takes a locked
265 runlist. This allows us to find runlist elements with the runlist
266 lock already held without having to drop and reacquire it around the
267 call. Adapt all callers.
268 - Change time to u64 in time.h::ntfs2utc() as it otherwise generates a
269 warning in the do_div() call on sparc32. Thanks to Meelis Roos for
270 the report and analysis of the warning.
271 - Fix a nasty runlist merge bug when merging two holes.
272 - Set the ntfs_inode->allocated_size to the real allocated size in the
273 mft record for resident attributes (fs/ntfs/inode.c).
274 - Small readability cleanup to use "a" instead of "ctx->attr"
275 everywhere (fs/ntfs/inode.c).
276 - Make fs/ntfs/namei.c::ntfs_get_{parent,dentry} static and move the
277 definition of ntfs_export_ops from fs/ntfs/super.c to namei.c. Also,
278 declare ntfs_export_ops in fs/ntfs/ntfs.h.
279 - Correct sparse file handling. The compressed values need to be
280 checked and set in the ntfs inode as done for compressed files and
281 the compressed size needs to be used for vfs inode->i_blocks instead
282 of the allocated size, again, as done for compressed files.
283 - Add AT_EA in addition to AT_DATA to whitelist for being allowed to be
284 non-resident in fs/ntfs/attrib.c::ntfs_attr_can_be_non_resident().
285 - Add fs/ntfs/attrib.c::ntfs_attr_vcn_to_lcn_nolock() used by the new
286 write code.
287 - Fix bug in fs/ntfs/attrib.c::ntfs_find_vcn_nolock() where after
288 dropping the read lock and taking the write lock we were not checking
289 whether someone else did not already do the work we wanted to do.
290 - Rename fs/ntfs/attrib.c::ntfs_find_vcn_nolock() to
291 ntfs_attr_find_vcn_nolock() and update all callers.
292 - Add fs/ntfs/attrib.[hc]::ntfs_attr_make_non_resident().
293 - Fix sign of various error return values to be negative in
294 fs/ntfs/lcnalloc.c.
295 - Modify ->readpage and ->writepage (fs/ntfs/aops.c) so they detect and
296 handle the case where an attribute is converted from resident to
297 non-resident by a concurrent file write.
298 - Remove checks for NULL before calling kfree() since kfree() does the
299 checking itself. (Jesper Juhl)
300 - Some utilities modify the boot sector but do not update the checksum.
301 Thus, relax the checking in fs/ntfs/super.c::is_boot_sector_ntfs() to
302 only emit a warning when the checksum is incorrect rather than
303 refusing the mount. Thanks to Bernd Casimir for pointing this
304 problem out.
305 - Update attribute definition handling.
306 - Add NTFS_MAX_CLUSTER_SIZE and NTFS_MAX_PAGES_PER_CLUSTER constants.
307 - Use NTFS_MAX_CLUSTER_SIZE in super.c instead of hard coding 0x10000.
308 - Use MAX_BUF_PER_PAGE instead of variable sized array allocation for
309 better code generation and one less sparse warning in fs/ntfs/aops.c.
310 - Remove spurious void pointer casts from fs/ntfs/. (Pekka Enberg)
311 - Use C99 style structure initialization after memory allocation where
312 possible (fs/ntfs/{attrib.c,index.c,super.c}). Thanks to Al Viro and
313 Pekka Enberg.
314 - Stamp the transaction log ($UsnJrnl), aka user space journal, if it
315 is active on the volume and we are mounting read-write or remounting
316 from read-only to read-write.
317 - Fix a bug in address space operations error recovery code paths where
318 if the runlist was not mapped at all and a mapping error occurred we
319 would leave the runlist locked on exit to the function so that the
320 next access to the same file would try to take the lock and deadlock.
321 - Detect the case when Windows has been suspended to disk on the volume
322 to be mounted and if this is the case do not allow (re)mounting
323 read-write. This is done by parsing hiberfil.sys if present.
324 - Fix several occurrences of a bug where we would perform 'var & ~const'
325 with a 64-bit variable and an int, i.e. 32-bit, constant. This causes
326 the higher order 32-bits of the 64-bit variable to be zeroed. To fix
327 this cast the 'const' to the same 64-bit type as 'var'.
328 - Change the runlist terminator of the newly allocated cluster(s) to
329 LCN_ENOENT in ntfs_attr_make_non_resident(). Otherwise the runlist
330 code gets confused.
331 - Add an extra parameter @last_vcn to ntfs_get_size_for_mapping_pairs()
332 and ntfs_mapping_pairs_build() to allow the runlist encoding to be
333 partial which is desirable when filling holes in sparse attributes.
334 Update all callers.
335 - Change ntfs_map_runlist_nolock() to only decompress the mapping pairs
336 if the requested vcn is inside it. Otherwise we get into problems
337 when we try to map an out of bounds vcn because we then try to map
338 the already mapped runlist fragment which causes
339 ntfs_mapping_pairs_decompress() to fail and return error. Update
340 ntfs_attr_find_vcn_nolock() accordingly.
341 - Fix a nasty deadlock that appeared in recent kernels.
342 The situation: VFS inode X on a mounted ntfs volume is dirty. For
343 same inode X, the ntfs_inode is dirty and thus corresponding on-disk
344 inode, i.e. mft record, which is in a dirty PAGE_CACHE_PAGE belonging
345 to the table of inodes, i.e. $MFT, inode 0.
346 What happens:
347 Process 1: sys_sync()/umount()/whatever... calls
348 __sync_single_inode() for $MFT -> do_writepages() -> write_page for
349 the dirty page containing the on-disk inode X, the page is now locked
350 -> ntfs_write_mst_block() which clears PageUptodate() on the page to
351 prevent anyone else getting hold of it whilst it does the write out.
352 This is necessary as the on-disk inode needs "fixups" applied before
353 the write to disk which are removed again after the write and
354 PageUptodate is then set again. It then analyses the page looking
355 for dirty on-disk inodes and when it finds one it calls
356 ntfs_may_write_mft_record() to see if it is safe to write this
357 on-disk inode. This then calls ilookup5() to check if the
358 corresponding VFS inode is in icache(). This in turn calls ifind()
359 which waits on the inode lock via wait_on_inode whilst holding the
360 global inode_lock.
361 Process 2: pdflush results in a call to __sync_single_inode for the
362 same VFS inode X on the ntfs volume. This locks the inode (I_LOCK)
363 then calls write-inode -> ntfs_write_inode -> map_mft_record() ->
364 read_cache_page() for the page (in page cache of table of inodes
365 $MFT, inode 0) containing the on-disk inode. This page has
366 PageUptodate() clear because of Process 1 (see above) so
367 read_cache_page() blocks when it tries to take the page lock for the
368 page so it can call ntfs_read_page().
369 Thus Process 1 is holding the page lock on the page containing the
370 on-disk inode X and it is waiting on the inode X to be unlocked in
371 ifind() so it can write the page out and then unlock the page.
372 And Process 2 is holding the inode lock on inode X and is waiting for
373 the page to be unlocked so it can call ntfs_readpage() or discover
374 that Process 1 set PageUptodate() again and use the page.
375 Thus we have a deadlock due to ifind() waiting on the inode lock.
376 The solution: The fix is to use the newly introduced
377 ilookup5_nowait() which does not wait on the inode's lock and hence
378 avoids the deadlock. This is safe as we do not care about the VFS
379 inode and only use the fact that it is in the VFS inode cache and the
380 fact that the vfs and ntfs inodes are one struct in memory to find
381 the ntfs inode in memory if present. Also, the ntfs inode has its
382 own locking so it does not matter if the vfs inode is locked.
383 - Fix bug in mft record writing where we forgot to set the device in
384 the buffers when mapping them after the VM had discarded them.
385 Thanks to Martin MOKREJŠ for the bug report.
386
3872.1.22 - Many bug and race fixes and error handling improvements.
388
389 - Improve error handling in fs/ntfs/inode.c::ntfs_truncate().
390 - Change fs/ntfs/inode.c::ntfs_truncate() to return an error code
391 instead of void and provide a helper ntfs_truncate_vfs() for the
392 vfs ->truncate method.
393 - Add a new ntfs inode flag NInoTruncateFailed() and modify
394 fs/ntfs/inode.c::ntfs_truncate() to set and clear it appropriately.
395 - Fix min_size and max_size definitions in ATTR_DEF structure in
396 fs/ntfs/layout.h to be signed.
397 - Add attribute definition handling helpers to fs/ntfs/attrib.[hc]:
398 ntfs_attr_size_bounds_check(), ntfs_attr_can_be_non_resident(), and
399 ntfs_attr_can_be_resident(), which in turn use the new private helper
400 ntfs_attr_find_in_attrdef().
401 - In fs/ntfs/aops.c::mark_ntfs_record_dirty(), take the
402 mapping->private_lock around the dirtying of the buffer heads
403 analogous to the way it is done in __set_page_dirty_buffers().
404 - Ensure the mft record size does not exceed the PAGE_CACHE_SIZE at
405 mount time as this cannot work with the current implementation.
406 - Check for location of attribute name and improve error handling in
407 general in fs/ntfs/inode.c::ntfs_read_locked_inode() and friends.
408 - In fs/ntfs/aops.c::ntfs_writepage(), if the page is fully outside
409 i_size, i.e. race with truncate, invalidate the buffers on the page
410 so that they become freeable and hence the page does not leak.
411 - Remove unused function fs/ntfs/runlist.c::ntfs_rl_merge(). (Adrian
412 Bunk)
413 - Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_find() that resulted in
414 a NULL pointer dereference in the error code path when a corrupt
415 attribute was found. (Thanks to Domen Puncer for the bug report.)
416 - Add MODULE_VERSION() to fs/ntfs/super.c.
417 - Make several functions and variables static. (Adrian Bunk)
418 - Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() so it allocates
419 buffers for the page if they are not present and then marks the
420 buffers belonging to the ntfs record dirty. This causes the buffers
421 to become busy and hence they are safe from removal until the page
422 has been written out.
423 - Fix stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find() in the
424 error handling code path that resulted in a BUG() due to trying to
425 unmap an extent mft record when the mapping of it had failed and it
426 thus was not mapped. (Thanks to Ken MacFerrin for the bug report.)
427 - Drop the runlist lock after the vcn has been read in
428 fs/ntfs/lcnalloc.c::__ntfs_cluster_free().
429 - Rewrite handling of multi sector transfer errors. We now do not set
430 PageError() when such errors are detected in the async i/o handler
431 fs/ntfs/aops.c::ntfs_end_buffer_async_read(). All users of mst
432 protected attributes now check the magic of each ntfs record as they
433 use it and act appropriately. This has the effect of making errors
434 granular per ntfs record rather than per page which solves the case
435 where we cannot access any of the ntfs records in a page when a
436 single one of them had an mst error. (Thanks to Ken MacFerrin for
437 the bug report.)
438 - Fix error handling in fs/ntfs/quota.c::ntfs_mark_quotas_out_of_date()
439 where we failed to release i_mutex on the $Quota/$Q attribute inode.
440 - Fix bug in handling of bad inodes in fs/ntfs/namei.c::ntfs_lookup().
441 - Add mapping of unmapped buffers to all remaining code paths, i.e.
442 fs/ntfs/aops.c::ntfs_write_mst_block(), mft.c::ntfs_sync_mft_mirror(),
443 and write_mft_record_nolock(). From now on we require that the
444 complete runlist for the mft mirror is always mapped into memory.
445 - Add creation of buffers to fs/ntfs/mft.c::ntfs_sync_mft_mirror().
446 - Improve error handling in fs/ntfs/aops.c::ntfs_{read,write}_block().
447 - Cleanup fs/ntfs/aops.c::ntfs_{read,write}page() since we know that a
448 resident attribute will be smaller than a page which makes the code
449 simpler. Also make the code more tolerant to concurrent ->truncate.
450
4512.1.21 - Fix some races and bugs, rewrite mft write code, add mft allocator.
452
453 - Implement extent mft record deallocation
454 fs/ntfs/mft.c::ntfs_extent_mft_record_free().
455 - Split runlist related functions off from attrib.[hc] to runlist.[hc].
456 - Add vol->mft_data_pos and initialize it at mount time.
457 - Rename init_runlist() to ntfs_init_runlist(), ntfs_vcn_to_lcn() to
458 ntfs_rl_vcn_to_lcn(), decompress_mapping_pairs() to
459 ntfs_mapping_pairs_decompress(), ntfs_merge_runlists() to
460 ntfs_runlists_merge() and adapt all callers.
461 - Add fs/ntfs/runlist.[hc]::ntfs_get_nr_significant_bytes(),
462 ntfs_get_size_for_mapping_pairs(), ntfs_write_significant_bytes(),
463 and ntfs_mapping_pairs_build(), adapted from libntfs.
464 - Make fs/ntfs/lcnalloc.c::ntfs_cluster_free_from_rl_nolock() not
465 static and add a declaration for it to lcnalloc.h.
466 - Add fs/ntfs/lcnalloc.h::ntfs_cluster_free_from_rl() which is a static
467 inline wrapper for ntfs_cluster_free_from_rl_nolock() which takes the
468 cluster bitmap lock for the duration of the call.
469 - Add fs/ntfs/attrib.[hc]::ntfs_attr_record_resize().
470 - Implement the equivalent of memset() for an ntfs attribute in
471 fs/ntfs/attrib.[hc]::ntfs_attr_set() and switch
472 fs/ntfs/logfile.c::ntfs_empty_logfile() to using it.
473 - Remove unnecessary casts from LCN_* constants.
474 - Implement fs/ntfs/runlist.c::ntfs_rl_truncate_nolock().
475 - Add MFT_RECORD_OLD as a copy of MFT_RECORD in fs/ntfs/layout.h and
476 change MFT_RECORD to contain the NTFS 3.1+ specific fields.
477 - Add a helper function fs/ntfs/aops.c::mark_ntfs_record_dirty() which
478 marks all buffers belonging to an ntfs record dirty, followed by
479 marking the page the ntfs record is in dirty and also marking the vfs
480 inode containing the ntfs record dirty (I_DIRTY_PAGES).
481 - Switch fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to using the
482 new helper fs/ntfs/aops.c::mark_ntfs_record_dirty() and remove the no
483 longer needed fs/ntfs/index.[hc]::__ntfs_index_entry_mark_dirty().
484 - Move ntfs_{un,}map_page() from ntfs.h to aops.h and fix resulting
485 include errors.
486 - Move the typedefs for runlist_element and runlist from types.h to
487 runlist.h and fix resulting include errors.
488 - Remove unused {__,}format_mft_record() from fs/ntfs/mft.c.
489 - Modify fs/ntfs/mft.c::__mark_mft_record_dirty() to use the helper
490 mark_ntfs_record_dirty() which also changes the behaviour in that we
491 now set the buffers belonging to the mft record dirty as well as the
492 page itself.
493 - Update fs/ntfs/mft.c::write_mft_record_nolock() and sync_mft_mirror()
494 to cope with the fact that there now are dirty buffers in mft pages.
495 - Update fs/ntfs/inode.c::ntfs_write_inode() to also use the helper
496 mark_ntfs_record_dirty() and thus to set the buffers belonging to the
497 mft record dirty as well as the page itself.
498 - Fix compiler warnings on x86-64 in fs/ntfs/dir.c. (Randy Dunlap,
499 slightly modified by me)
500 - Add fs/ntfs/mft.c::try_map_mft_record() which fails with -EALREADY if
501 the mft record is already locked and otherwise behaves the same way
502 as fs/ntfs/mft.c::map_mft_record().
503 - Modify fs/ntfs/mft.c::write_mft_record_nolock() so that it only
504 writes the mft record if the buffers belonging to it are dirty.
505 Otherwise we assume that it was written out by other means already.
506 - Attempting to write outside initialized size is _not_ a bug so remove
507 the bug check from fs/ntfs/aops.c::ntfs_write_mst_block(). It is in
508 fact required to write outside initialized size when preparing to
509 extend the initialized size.
510 - Map the page instead of using page_address() before writing to it in
511 fs/ntfs/aops.c::ntfs_mft_writepage().
512 - Provide exclusion between opening an inode / mapping an mft record
513 and accessing the mft record in fs/ntfs/mft.c::ntfs_mft_writepage()
514 by setting the page not uptodate throughout ntfs_mft_writepage().
515 - Clear the page uptodate flag in fs/ntfs/aops.c::ntfs_write_mst_block()
516 to ensure no one can see the page whilst the mst fixups are applied.
517 - Add the helper fs/ntfs/mft.c::ntfs_may_write_mft_record() which
518 checks if an mft record may be written out safely obtaining any
519 necessary locks in the process. This is used by
520 fs/ntfs/aops.c::ntfs_write_mst_block().
521 - Modify fs/ntfs/aops.c::ntfs_write_mst_block() to also work for
522 writing mft records and improve its error handling in the process.
523 Now if any of the records in the page fail to be written out, all
524 other records will be written out instead of aborting completely.
525 - Remove ntfs_mft_aops and update all users to use ntfs_mst_aops.
526 - Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to set the
527 ntfs_mst_aops for all inodes which are NInoMstProtected() and
528 ntfs_aops for all other inodes.
529 - Rename fs/ntfs/mft.c::sync_mft_mirror{,_umount}() to
530 ntfs_sync_mft_mirror{,_umount}() and change their parameters so they
531 no longer require an ntfs inode to be present. Update all callers.
532 - Cleanup the error handling in fs/ntfs/mft.c::ntfs_sync_mft_mirror().
533 - Clear the page uptodate flag in fs/ntfs/mft.c::ntfs_sync_mft_mirror()
534 to ensure no one can see the page whilst the mst fixups are applied.
535 - Remove the no longer needed fs/ntfs/mft.c::ntfs_mft_writepage() and
536 fs/ntfs/mft.c::try_map_mft_record().
537 - Fix callers of fs/ntfs/aops.c::mark_ntfs_record_dirty() to call it
538 with the ntfs inode which contains the page rather than the ntfs
539 inode the mft record of which is in the page.
540 - Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by moving the
541 index inode bitmap inode release code from there to
542 fs/ntfs/inode.c::ntfs_clear_big_inode(). (Thanks to Christoph
543 Hellwig for spotting this.)
544 - Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by taking the
545 inode semaphore around the code that sets ni->itype.index.bmp_ino to
546 NULL and reorganize the code to optimize it a bit. (Thanks to
547 Christoph Hellwig for spotting this.)
548 - Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() to no longer take the
549 ntfs inode as a parameter as this is confusing and misleading and the
550 needed ntfs inode is available via NTFS_I(page->mapping->host).
551 Adapt all callers to this change.
552 - Modify fs/ntfs/mft.c::write_mft_record_nolock() and
553 fs/ntfs/aops.c::ntfs_write_mst_block() to only check the dirty state
554 of the first buffer in a record and to take this as the ntfs record
555 dirty state. We cannot look at the dirty state for subsequent
556 buffers because we might be racing with
557 fs/ntfs/aops.c::mark_ntfs_record_dirty().
558 - Move the static inline ntfs_init_big_inode() from fs/ntfs/inode.c to
559 inode.h and make fs/ntfs/inode.c::__ntfs_init_inode() non-static and
560 add a declaration for it to inode.h. Fix some compilation issues
561 that resulted due to #includes and header file interdependencies.
562 - Simplify setup of i_mode in fs/ntfs/inode.c::ntfs_read_locked_inode().
563 - Add helpers fs/ntfs/layout.h::MK_MREF() and MK_LE_MREF().
564 - Modify fs/ntfs/mft.c::map_extent_mft_record() to only verify the mft
565 record sequence number if it is specified (i.e. not zero).
566 - Add fs/ntfs/mft.[hc]::ntfs_mft_record_alloc() and various helper
567 functions used by it.
568 - Update Documentation/filesystems/ntfs.txt with instructions on how to
569 use the Device-Mapper driver with NTFS ftdisk/LDM raid. This removes
570 the linear raid problem with the Software RAID / MD driver when one
571 or more of the devices has an odd number of sectors.
572
5732.1.20 - Fix two stupid bugs introduced in 2.1.18 release.
574
575 - Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_reinit_search_ctx()
576 where we did not clear ctx->al_entry but it was still set due to
577 changes in ntfs_attr_lookup() and ntfs_external_attr_find() in
578 particular.
579 - Fix another stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find()
580 where we forgot to unmap the extent mft record when we had finished
581 enumerating an attribute which caused a bug check to trigger when the
582 VFS calls ->clear_inode.
583
5842.1.19 - Many cleanups, improvements, and a minor bug fix.
585
586 - Update ->setattr (fs/ntfs/inode.c::ntfs_setattr()) to refuse to
587 change the uid, gid, and mode of an inode as we do not support NTFS
588 ACLs yet.
589 - Remove BKL use from ntfs_setattr() syncing up with the rest of the
590 kernel.
591 - Get rid of the ugly transparent union in fs/ntfs/dir.c::ntfs_readdir()
592 and ntfs_filldir() as per suggestion from Al Viro.
593 - Change '\0' and L'\0' to simply 0 as per advice from Linus Torvalds.
594 - Update ->truncate (fs/ntfs/inode.c::ntfs_truncate()) to check if the
595 inode size has changed and to only output an error if so.
596 - Rename fs/ntfs/attrib.h::attribute_value_length() to ntfs_attr_size().
597 - Add le{16,32,64} as well as sle{16,32,64} data types to
598 fs/ntfs/types.h.
599 - Change ntfschar to be le16 instead of u16 in fs/ntfs/types.h.
600 - Add le versions of VCN, LCN, and LSN called leVCN, leLCN, and leLSN,
601 respectively, to fs/ntfs/types.h.
602 - Update endianness conversion macros in fs/ntfs/endian.h to use the
603 new types as appropriate.
604 - Do proper type casting when using sle64_to_cpup() in fs/ntfs/dir.c
605 and index.c.
606 - Add leMFT_REF data type to fs/ntfs/layout.h.
607 - Update all NTFS header files with the new little endian data types.
608 Affected files are fs/ntfs/layout.h, logfile.h, and time.h.
609 - Do proper type casting when using ntfs_is_*_recordp() in
610 fs/ntfs/logfile.c, mft.c, and super.c.
611 - Fix all the sparse bitwise warnings. Had to change all the typedef
612 enums storing little endian values to simple enums plus a typedef for
613 the datatype to make sparse happy.
614 - Fix a bug found by the new sparse bitwise warnings where the default
615 upcase table was defined as a pointer to wchar_t rather than ntfschar
616 in fs/ntfs/ntfs.h and super.c.
617 - Change {const_,}cpu_to_le{16,32}(0) to just 0 as suggested by Al Viro.
618
6192.1.18 - Fix scheduling latencies at mount time as well as an endianness bug.
620
621 - Remove vol->nr_mft_records as it was pretty meaningless and optimize
622 the calculation of total/free inodes as used by statfs().
623 - Fix scheduling latencies in ntfs_fill_super() by dropping the BKL
624 because the code itself is using the ntfs_lock semaphore which
625 provides safe locking. (Ingo Molnar)
626 - Fix a potential bug in fs/ntfs/mft.c::map_extent_mft_record() that
627 could occur in the future for when we start closing/freeing extent
628 inodes if we don't set base_ni->ext.extent_ntfs_inos to NULL after
629 we free it.
630 - Rename {find,lookup}_attr() to ntfs_attr_{find,lookup}() as well as
631 find_external_attr() to ntfs_external_attr_find() to cleanup the
632 namespace a bit and to be more consistent with libntfs.
633 - Rename {{re,}init,get,put}_attr_search_ctx() to
634 ntfs_attr_{{re,}init,get,put}_search_ctx() as well as the type
635 attr_search_context to ntfs_attr_search_ctx.
636 - Force use of ntfs_attr_find() in ntfs_attr_lookup() when searching
637 for the attribute list attribute itself.
638 - Fix endianness bug in ntfs_external_attr_find().
639 - Change ntfs_{external_,}attr_find() to return 0 on success, -ENOENT
640 if the attribute is not found, and -EIO on real error. In the case
641 of -ENOENT, the search context is updated to describe the attribute
642 before which the attribute being searched for would need to be
643 inserted if such an action were to be desired and in the case of
644 ntfs_external_attr_find() the search context is also updated to
645 indicate the attribute list entry before which the attribute list
646 entry of the attribute being searched for would need to be inserted
647 if such an action were to be desired. Also make ntfs_attr_find()
648 static and remove its prototype from attrib.h as it is not used
649 anywhere other than attrib.c. Update ntfs_attr_lookup() and all
650 callers of ntfs_{external,}attr_{find,lookup}() for the new return
651 values.
652 - Minor cleanup of fs/ntfs/inode.c::ntfs_init_locked_inode().
653
6542.1.17 - Fix bugs in mount time error code paths and other updates.
655
656 - Implement bitmap modification code (fs/ntfs/bitmap.[hc]). This
657 includes functions to set/clear a single bit or a run of bits.
658 - Add fs/ntfs/attrib.[hc]::ntfs_find_vcn() which returns the locked
659 runlist element containing a particular vcn. It also takes care of
660 mapping any needed runlist fragments.
661 - Implement cluster (de-)allocation code (fs/ntfs/lcnalloc.[hc]).
662 - Load attribute definition table from $AttrDef at mount time.
663 - Fix bugs in mount time error code paths involving (de)allocation of
664 the default and volume upcase tables.
665 - Remove ntfs_nr_mounts as it is no longer used.
666
6672.1.16 - Implement access time updates, file sync, async io, and read/writev.
668
669 - Add support for readv/writev and aio_read/aio_write (fs/ntfs/file.c).
670 This is done by setting the appropriate file operations pointers to
671 the generic helper functions provided by mm/filemap.c.
672 - Implement fsync, fdatasync, and msync both for files (fs/ntfs/file.c)
673 and directories (fs/ntfs/dir.c).
674 - Add support for {a,m,c}time updates to inode.c::ntfs_write_inode().
675 Note, except for the root directory and any other system files opened
676 by the user, the system files will not have their access times
677 updated as they are only accessed at the inode level and hence the
678 file level functions which cause the times to be updated are never
679 invoked.
680
6812.1.15 - Invalidate quotas when (re)mounting read-write.
682
683 - Add new element itype.index.collation_rule to the ntfs inode
684 structure and set it appropriately in ntfs_read_locked_inode().
685 - Implement a new inode type "index" to allow efficient access to the
686 indices found in various system files and adapt inode handling
687 accordingly (fs/ntfs/inode.[hc]). An index inode is essentially an
688 attribute inode (NInoAttr() is true) with an attribute type of
689 AT_INDEX_ALLOCATION. As such, it is no longer allowed to call
690 ntfs_attr_iget() with an attribute type of AT_INDEX_ALLOCATION as
691 there would be no way to distinguish between normal attribute inodes
692 and index inodes. The function to obtain an index inode is
693 ntfs_index_iget() and it uses the helper function
694 ntfs_read_locked_index_inode(). Note, we do not overload
695 ntfs_attr_iget() as indices consist of multiple attributes so using
696 ntfs_attr_iget() to obtain an index inode would be confusing.
697 - Ensure that there is no overflow when doing page->index <<
698 PAGE_CACHE_SHIFT by casting page->index to s64 in fs/ntfs/aops.c.
699 - Use atomic kmap instead of kmap() in fs/ntfs/aops.c::ntfs_read_page()
700 and ntfs_read_block().
701 - Use case sensitive attribute lookups instead of case insensitive ones.
702 - Lock all page cache pages belonging to mst protected attributes while
703 accessing them to ensure we never see corrupt data while the page is
704 under writeout.
705 - Add framework for generic ntfs collation (fs/ntfs/collation.[hc]).
706 We have ntfs_is_collation_rule_supported() to check if the collation
707 rule you want to use is supported and ntfs_collation() which actually
708 collates two data items. We currently only support COLLATION_BINARY
709 and COLLATION_NTOFS_ULONG but support for other collation rules will
710 be added as the need arises.
711 - Add a new type, ntfs_index_context, to allow retrieval of an index
712 entry using the corresponding index key. To get an index context,
713 use ntfs_index_ctx_get() and to release it, use ntfs_index_ctx_put().
714 This also adds a new slab cache for the index contexts. To lookup a
715 key in an index inode, use ntfs_index_lookup(). After modifying an
716 index entry, call ntfs_index_entry_flush_dcache_page() followed by
717 ntfs_index_entry_mark_dirty() to ensure the changes are written out
718 to disk. For details see fs/ntfs/index.[hc]. Note, at present, if
719 an index entry is in the index allocation attribute rather than the
720 index root attribute it will not be written out (you will get a
721 warning message about discarded changes instead).
722 - Load the quota file ($Quota) and check if quota tracking is enabled
723 and if so, mark the quotas out of date. This causes windows to
724 rescan the volume on boot and update all quota entries.
725 - Add a set_page_dirty address space operation for ntfs_m[fs]t_aops.
726 It is simply set to __set_page_dirty_nobuffers() to make sure that
727 running set_page_dirty() on a page containing mft/ntfs records will
728 not affect the dirty state of the page buffers.
729 - Add fs/ntfs/index.c::__ntfs_index_entry_mark_dirty() which sets all
730 buffers that are inside the ntfs record in the page dirty after which
731 it sets the page dirty. This allows ->writepage to only write the
732 dirty index records rather than having to write all the records in
733 the page. Modify fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to
734 use this rather than __set_page_dirty_nobuffers().
735 - Implement fs/ntfs/aops.c::ntfs_write_mst_block() which enables the
736 writing of page cache pages belonging to mst protected attributes
737 like the index allocation attribute in directory indices and other
738 indices like $Quota/$Q, etc. This means that the quota is now marked
739 out of date on all volumes rather than only on ones where the quota
740 defaults entry is in the index root attribute of the $Quota/$Q index.
741
7422.1.14 - Fix an NFSd caused deadlock reported by several users.
743
744 - Modify fs/ntfs/dir.c::ntfs_readdir() to copy the index root attribute value
745 to a buffer so that we can put the search context and unmap the mft
746 record before calling the filldir() callback. We need to do this
747 because of NFSd which calls ->lookup() from its filldir() callback
748 and this causes NTFS to deadlock as ntfs_lookup() maps the mft record
749 of the directory and since ntfs_readdir() has got it mapped already
750 ntfs_lookup() deadlocks.
751
7522.1.13 - Enable overwriting of resident files and housekeeping of system files.
753
754 - Implement writing of mft records (fs/ntfs/mft.[hc]), which includes
755 keeping the mft mirror in sync with the mft when mirrored mft records
756 are written. The functions are write_mft_record{,_nolock}(). The
757 implementation is quite rudimentary for now with lots of things not
758 implemented yet but I am not sure any of them can actually occur so
759 I will wait for people to hit each one and only then implement it.
760 - Commit open system inodes at umount time. This should make it
761 virtually impossible for sync_mft_mirror_umount() to ever be needed.
762 - Implement ->write_inode (fs/ntfs/inode.c::ntfs_write_inode()) for the
763 ntfs super operations. This gives us inode writing via the VFS inode
764 dirty code paths. Note: Access time updates are not implemented yet.
765 - Implement fs/ntfs/mft.[hc]::{,__}mark_mft_record_dirty() and make
766 fs/ntfs/aops.c::ntfs_writepage() and ntfs_commit_write() use it, thus
767 finally enabling resident file overwrite! (-8 This also includes a
768 placeholder for ->writepage (ntfs_mft_writepage()), which for now
769 just redirties the page and returns. Also, at umount time, we for
770 now throw away all mft data page cache pages after the last call to
771 ntfs_commit_inode() in the hope that all inodes will have been
772 written out by then and hence no dirty (meta)data will be lost. We
773 also check for this case and emit an error message telling the user
774 to run chkdsk.
775 - Use set_page_writeback() and end_page_writeback() in the resident
776 attribute code path of fs/ntfs/aops.c::ntfs_writepage() otherwise
777 the radix-tree tag PAGECACHE_TAG_DIRTY remains set even though the
778 page is clean.
779 - Implement ntfs_mft_writepage() so it now checks if any of the mft
780 records in the page are dirty and if so redirties the page and
781 returns. Otherwise it just returns (after doing set_page_writeback(),
782 unlock_page(), end_page_writeback() or the radix-tree tag
783 PAGECACHE_TAG_DIRTY remains set even though the page is clean), thus
784 allowing the VM to do with the page as it pleases. Also, at umount
785 time, now only throw away dirty mft (meta)data pages if dirty inodes
786 are present and ask the user to email us if they see this happening.
787 - Add functions ntfs_{clear,set}_volume_flags(), to modify the volume
788 information flags (fs/ntfs/super.c).
789 - Mark the volume dirty when (re)mounting read-write and mark it clean
790 when unmounting or remounting read-only. If any volume errors are
791 found, the volume is left marked dirty to force chkdsk to run.
792 - Add code to set the NT4 compatibility flag when (re)mounting
793 read-write for newer NTFS versions but leave it commented out for now
794 since we do not make any modifications that are NTFS 1.2 specific yet
795 and since setting this flag breaks Captive-NTFS which is not nice.
796 This code must be enabled once we start writing NTFS 1.2 specific
797 changes otherwise Windows NTFS driver might crash / cause corruption.
798
7992.1.12 - Fix the second fix to the decompression engine and some cleanups.
800
801 - Add a new address space operations struct, ntfs_mst_aops, for mst
802 protected attributes. This is because the default ntfs_aops do not
803 make sense with mst protected data and were they to write anything to
804 such an attribute they would cause data corruption so we provide
805 ntfs_mst_aops which does not have any write related operations set.
806 - Cleanup dirty ntfs inode handling (fs/ntfs/inode.[hc]) which also
807 includes an adapted ntfs_commit_inode() and an implementation of
808 ntfs_write_inode() which for now just cleans dirty inodes without
809 writing them (it does emit a warning that this is happening).
810 - Undo the second decompression engine fix (see 2.1.9 release ChangeLog
811 entry) as it was only fixing a theoretical bug but at the same time
812 it badly broke the handling of sparse and uncompressed compression
813 blocks.
814
8152.1.11 - Driver internal cleanups.
816
817 - Only build logfile.o if building the driver with read-write support.
818 - Really final white space cleanups.
819 - Use generic_ffs() instead of ffs() in logfile.c which allows the
820 log_page_size variable to be optimized by gcc into a constant.
821 - Rename uchar_t to ntfschar everywhere as uchar_t is unsigned 1-byte
822 char as defined by POSIX and as found on some systems.
823
8242.1.10 - Force read-only (re)mounting of volumes with unsupported volume flags.
825
826 - Finish off the white space cleanups (remove trailing spaces, etc).
827 - Clean up ntfs_fill_super() and ntfs_read_inode_mount() by removing
828 the kludges around the first iget(). Instead of (re)setting ->s_op
829 we have the $MFT inode set up by explicit new_inode() / set ->i_ino /
830 insert_inode_hash() / call ntfs_read_inode_mount() directly. This
831 kills the need for a second super_operations and allows returning an error
832 from ntfs_read_inode_mount() without resorting to ugly "poisoning"
833 tricks. (Al Viro)
834 - Force read-only (re)mounting if any of the following bits are set in
835 the volume information flags:
836 VOLUME_IS_DIRTY, VOLUME_RESIZE_LOG_FILE,
837 VOLUME_UPGRADE_ON_MOUNT, VOLUME_DELETE_USN_UNDERWAY,
838 VOLUME_REPAIR_OBJECT_ID, VOLUME_MODIFIED_BY_CHKDSK
839 To make this easier we define VOLUME_MUST_MOUNT_RO_MASK with all the
840 above bits set so the test is made easy.
841
8422.1.9 - Fix two bugs in decompression engine.
843
844 - Fix a bug where we would not always detect that we have reached the
845 end of a compression block because we were ending at minus one byte
846 which is effectively the same as being at the end. The fix is to
847 check whether the uncompressed buffer has been fully filled and if so
848 we assume we have reached the end of the compression block. A big
849 thank you to Marcin Gibuła for the bug report, the assistance in
850 tracking down the bug and testing the fix.
851 - Fix a possible bug where when a compressed read is truncated to the
852 end of the file, the offset inside the last page was not truncated.
853
8542.1.8 - Handle $MFT mirror and $LogFile, improve time handling, and cleanups.
855
856 - Use get_bh() instead of manual atomic_inc() in fs/ntfs/compress.c.
857 - Modify fs/ntfs/time.c::ntfs2utc(), get_current_ntfs_time(), and
858 utc2ntfs() to work with struct timespec instead of time_t on the
859 Linux UTC time side thus preserving the full precision of the NTFS
860 time and only losing up to 99 nano-seconds in the Linux UTC time.
861 - Move fs/ntfs/time.c to fs/ntfs/time.h and make the time functions
862 static inline.
863 - Remove unused ntfs_dirty_inode().
864 - Cleanup super operations declaration in fs/ntfs/super.c.
865 - Wrap flush_dcache_mft_record_page() in #ifdef NTFS_RW.
866 - Add NInoTestSetFoo() and NInoTestClearFoo() macro magic to
867 fs/ntfs/inode.h and use it to declare NInoTest{Set,Clear}Dirty.
868 - Move typedefs for ntfs_attr and test_t from fs/ntfs/inode.c to
869 fs/ntfs/inode.h so they can be used elsewhere.
870 - Determine the mft mirror size as the number of mirrored mft records
871 and store it in ntfs_volume->mftmirr_size (fs/ntfs/super.c).
872 - Load the mft mirror at mount time and compare the mft records stored
873 in it to the ones in the mft. Force a read-only mount if the two do
874 not match (fs/ntfs/super.c).
875 - Fix type casting related warnings on 64-bit architectures. Thanks
876 to Meelis Roos for reporting them.
877 - Move %L to %ll as %L is floating point and %ll is integer which is
878 what we want.
879 - Read the journal ($LogFile) and determine if the volume has been
880 shutdown cleanly and force a read-only mount if not (fs/ntfs/super.c
881 and fs/ntfs/logfile.c). This is a little bit of a crude check in
882 that we only look at the restart areas and not at the actual log
883 records so that there will be a very small number of cases where we
884 think that a volume is dirty when in fact it is clean. This should
885 only affect volumes that have not been shutdown cleanly and did not
886 have any pending, non-check-pointed i/o.
887 - If the $LogFile indicates a clean shutdown and a read-write (re)mount
888 is requested, empty $LogFile by overwriting it with 0xff bytes to
889 ensure that Windows cannot cause data corruption by replaying a stale
890 journal after Linux has written to the volume.
891
8922.1.7 - Enable NFS exporting of mounted NTFS volumes.
893
894 - Set i_generation in the VFS inode from the seq_no of the NTFS inode.
895 - Make ntfs_lookup() NFS export safe, i.e. use d_splice_alias(), etc.
896 - Implement ->get_dentry() in fs/ntfs/namei.c::ntfs_get_dentry() as the
897 default doesn't allow inode number 0 which is a valid inode on NTFS
898 and even if it did allow that it uses iget() instead of ntfs_iget()
899 which makes it useless for us.
900 - Implement ->get_parent() in fs/ntfs/namei.c::ntfs_get_parent() as the
901 default just returns -EACCES which is not very useful.
902 - Define export operations (->s_export_op) for NTFS (ntfs_export_ops)
903 and set them up in the super block at mount time (super.c) this
904 allows mounted NTFS volumes to be exported via NFS.
905 - Add missing return -EOPNOTSUPP; in
906 fs/ntfs/aops.c::ntfs_commit_nonresident_write().
907 - Enforce no atime and no dir atime updates at mount/remount time as
908 they are not implemented yet anyway.
909 - Move a few assignments in fs/ntfs/attrib.c::load_attribute_list() to
910 after a NULL check. Thanks to Dave Jones for pointing this out.
911
9122.1.6 - Fix minor bug in handling of compressed directories.
913
914 - Fix bug in handling of compressed directories. A compressed
915 directory is not really compressed so when we set the ->i_blocks
916 field of a compressed directory inode we were setting it from the
917 non-existing field ni->itype.compressed.size which gave random
918 results... For directories we now always use ni->allocated_size.
919
9202.1.5 - Fix minor bug in attribute list attribute handling.
921
922 - Fix bug in attribute list handling. Actually it is not as much a bug
923 as too much protection in that we were not allowing attribute lists
924 which waste space on disk while Windows XP clearly allows it and in
925 fact creates such attribute lists so our driver was failing.
926 - Update NTFS documentation ready for 2.6 kernel release.
927
9282.1.4 - Reduce compiler requirements.
929
930 - Remove all uses of unnamed structs and unions in the driver to make
931 old and newer gcc versions happy. Makes it a bit uglier IMO but at
932 least people will stop hassling me about it.
933
9342.1.3 - Important bug fixes in corner cases.
935
936 - super.c::parse_ntfs_boot_sector(): Correct the check for 64-bit
937 clusters. (Philipp Thomas)
938 - attrib.c::load_attribute_list(): Fix bug when initialized_size is a
939 multiple of the block_size but not the cluster size. (Szabolcs
940 Szakacsits)
941
9422.1.2 - Important bug fixes alleviating the hangs in statfs.
943
944 - Fix buggy free cluster and free inode determination logic.
945
9462.1.1 - Minor updates.
947
948 - Add handling for initialized_size != data_size in compressed files.
949 - Reduce function local stack usage from 0x3d4 bytes to just noise in
950 fs/ntfs/upcase.c. (Randy Dunlap)
951 - Remove compiler warnings for newer gcc.
952 - Pages are no longer kmapped by mm/filemap.c::generic_file_write()
953 around calls to ->{prepare,commit}_write. Adapt NTFS appropriately
954 in fs/ntfs/aops.c::ntfs_prepare_nonresident_write() by using
955 kmap_atomic(KM_USER0).
956
9572.1.0 - First steps towards write support: implement file overwrite.
958
959 - Add configuration option for developmental write support with an
960 appropriately scary configuration help text.
961 - Initial implementation of fs/ntfs/aops.c::ntfs_writepage() and its
962 helper fs/ntfs/aops.c::ntfs_write_block(). This enables mmap(2) based
963 overwriting of existing files on ntfs. Note: Resident files are
964 only written into memory, and not written out to disk at present, so
965 avoid writing to files smaller than about 1kiB.
966 - Initial implementation of fs/ntfs/aops.c::ntfs_prepare_write(), its
967 helper fs/ntfs/aops.c::ntfs_prepare_nonresident_write() and their
968 counterparts, fs/ntfs/aops.c::ntfs_commit_write(), and
969 fs/ntfs/aops.c::ntfs_commit_nonresident_write(), respectively. Also,
970 add generic_file_write() to the ntfs file operations (fs/ntfs/file.c).
971 This enables write(2) based overwriting of existing files on ntfs.
972 Note: As with mmap(2) based overwriting, resident files are only
973 written into memory, and not written out to disk at present, so avoid
974 writing to files smaller than about 1kiB.
975 - Implement ->truncate (fs/ntfs/inode.c::ntfs_truncate()) and
976 ->setattr() (fs/ntfs/inode.c::ntfs_setattr()) inode operations for
977 files with the purpose of intercepting and aborting all i_size
978 changes which we do not support yet. ntfs_truncate() actually only
979 emits a warning message but AFAICS our interception of i_size changes
980 elsewhere means ntfs_truncate() never gets called for i_size changes.
981 It is only called from generic_file_write() when we fail in
982 ntfs_prepare_{,nonresident_}write() in order to discard any
983 instantiated buffers beyond i_size. Thus i_size is not actually
984 changed so our warning message is enough. Unfortunately it is not
985 possible to easily determine if i_size is being changed or not hence
986 we just emit an appropriately worded error message.
987
9882.0.25 - Small bug fixes and cleanups.
989
990 - Unlock the page in an out of memory error code path in
991 fs/ntfs/aops.c::ntfs_read_block().
992 - If fs/ntfs/aops.c::ntfs_read_page() is called on an uptodate page,
993 just unlock the page and return. (This can happen due to ->writepage
994 clearing PageUptodate() during write out of MstProtected()
995 attributes.)
996 - Remove leaked write code again.
997
9982.0.24 - Cleanups.
999
1000 - Treat BUG_ON() as ASSERT() not VERIFY(), i.e. do not use side effects
1001 inside BUG_ON(). (Adam J. Richter)
1002 - Split logical OR expressions inside BUG_ON() into individual BUG_ON()
1003 calls for improved debugging. (Adam J. Richter)
1004 - Add errors flag to the ntfs volume state, accessed via
1005 NVol{,Set,Clear}Errors(vol).
1006 - Do not allow read-write remounts of read-only volumes with errors.
1007 - Clarify comment for ntfs file operation sendfile which was added by
1008 Christoph Hellwig a while ago (just using generic_file_sendfile())
1009 to say that ntfs ->sendfile is only used for the case where the
1010 source data is on the ntfs partition and the destination is
1011 somewhere else, i.e. nothing we need to concern ourselves with.
1012 - Add generic_file_write() as our ntfs file write operation.
1013
10142.0.23 - Major bug fixes (races, deadlocks, non-i386 architectures).
1015
1016 - Massive internal locking changes to mft record locking. Fixes lock
1017 recursion and replaces the mrec_lock read/write semaphore with a
1018 mutex. Also removes the now superfluous mft_count. This fixes several
1019 race conditions and deadlocks, especially in the future write code.
1020 - Fix ntfs over loopback for compressed files by adding an
1021 optimization barrier. (gcc was screwing up otherwise ?)
1022 - Miscellaneous cleanups all over the code and a fix or two in error
1023 handling code paths.
1024 Thanks go to Christoph Hellwig for pointing out the following two:
1025 - Remove now unused function fs/ntfs/malloc.h::vmalloc_nofs().
1026 - Fix ntfs_free() for ia64 and parisc by checking for VMALLOC_END, too.
1027
10282.0.22 - Cleanups, mainly to ntfs_readdir(), and use C99 initializers.
1029
1030 - Change fs/ntfs/dir.c::ntfs_readdir() to only read/write ->f_pos once
1031 at entry/exit respectively.
1032 - Use C99 initializers for structures.
1033 - Remove unused variable blocks from fs/ntfs/aops.c::ntfs_read_block().
1034
10352.0.21 - Check for, and refuse to work with too large files/directories/volumes.
1036
1037 - Limit volume size at mount time to 2TiB on architectures where
1038 unsigned long is 32-bits (fs/ntfs/super.c::parse_ntfs_boot_sector()).
1039 This is the most we can do without overflowing the 32-bit limit of
1040 the block device size imposed on us by sb_bread() and sb_getblk()
1041 for the time being.
1042 - Limit file/directory size at open() time to 16TiB on architectures
1043 where unsigned long is 32-bits (fs/ntfs/file.c::ntfs_file_open() and
1044 fs/ntfs/dir.c::ntfs_dir_open()). This is the most we can do without
1045 overflowing the page cache page index.
1046
10472.0.20 - Support non-resident directory index bitmaps, fix page leak in readdir.
1048
1049 - Move the directory index bitmap to use an attribute inode instead of
1050 having special fields for it inside the ntfs inode structure. This
1051 means that the index bitmaps now use the page cache for i/o, too,
1052 and also as a side effect we get support for non-resident index
1053 bitmaps for free.
1054 - Simplify/cleanup error handling in fs/ntfs/dir.c::ntfs_readdir() and
1055 fix a page leak that manifested itself in some cases.
1056 - Add fs/ntfs/inode.c::ntfs_put_inode(), which we need to release the
1057 index bitmap inode on the final iput().
1058
10592.0.19 - Fix race condition, improvements, and optimizations in i/o interface.
1060
1061 - Apply block optimization added to fs/ntfs/aops.c::ntfs_read_block()
1062 to fs/ntfs/compress.c::ntfs_file_read_compressed_block() as well.
1063 - Drop the "file" from ntfs_file_read_compressed_block().
1064 - Rename fs/ntfs/aops.c::ntfs_enb_buffer_read_async() to
1065 ntfs_end_buffer_async_read() (more like the fs/buffer.c counterpart).
1066 - Update ntfs_end_buffer_async_read() with the improved logic from
1067 its updated counterpart fs/buffer.c::end_buffer_async_read(). Apply
1068 further logic improvements to better determine when we set PageError.
1069 - Update submission of buffers in fs/ntfs/aops.c::ntfs_read_block() to
1070 check for the buffers being uptodate first in line with the updated
1071 fs/buffer.c::block_read_full_page(). This plugs a small race
1072 condition.
1073
10742.0.18 - Fix race condition in reading of compressed files.
1075
1076 - There was a narrow window between checking a buffer head for being
1077 uptodate and locking it in ntfs_file_read_compressed_block(). We now
1078 lock the buffer and then check whether it is uptodate or not.
1079
10802.0.17 - Cleanups and optimizations - shrinking the ToDo list.
1081
1082 - Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to return an error
1083 code and update callers, i.e. ntfs_iget(), to pass that error code
1084 up instead of just using -EIO.
1085 - Modifications to super.c to ensure that both mount and remount
1086 cannot set any write related options when the driver is compiled
1087 read-only.
1088 - Optimize block resolution in fs/ntfs/aops.c::ntfs_read_block() to
1089 cache the current runlist element. This should improve performance
1090 when reading very large and/or very fragmented data.
1091
10922.0.16 - Convert access to $MFT/$BITMAP to attribute inode API.
1093
1094 - Fix a stupid bug introduced in 2.0.15 where we were unmapping the
1095 wrong inode in fs/ntfs/inode.c::ntfs_attr_iget().
1096 - Fix debugging check in fs/ntfs/aops.c::ntfs_read_block().
1097 - Convert $MFT/$BITMAP access to attribute inode API and remove all
1098 remnants of the ugly mftbmp address space and operations hack. This
1099 means we finally have only one readpage function as well as only one
1100 async io completion handler. Yey! The mft bitmap is now just an
1101 attribute inode and is accessed from vol->mftbmp_ino just as if it
1102 were a normal file. Fake inodes rule. (-:
1103
11042.0.15 - Fake inodes based attribute i/o via the pagecache, fixes and cleanups.
1105
1106 - Fix silly bug in fs/ntfs/super.c::parse_options() which was causing
1107 remounts to fail when the partition had an entry in /etc/fstab and
1108 the entry specified the nls= option.
1109 - Apply same macro magic used in fs/ntfs/inode.h to fs/ntfs/volume.h to
1110 expand all the helper functions NVolFoo(), NVolSetFoo(), and
1111 NVolClearFoo().
1112 - Move copyright statement from driver initialisation message to
1113 module description (fs/super.c). This makes the initialisation
1114 message fit on one line and fits in better with rest of kernel.
1115 - Update fs/ntfs/attrib.c::map_run_list() to work on both real and
1116 attribute inodes, and both for files and directories.
1117 - Implement fake attribute inodes allowing all attribute i/o to go via
1118 the page cache and to use all the normal vfs/mm functionality:
1119 - Add ntfs_attr_iget() and its helper ntfs_read_locked_attr_inode()
1120 to fs/ntfs/inode.c.
1121 - Add needed cleanup code to ntfs_clear_big_inode().
1122 - Merge address space operations for files and directories (aops.c),
1123 now just have ntfs_aops:
1124 - Rename:
1125 end_buffer_read_attr_async() -> ntfs_end_buffer_read_async(),
1126 ntfs_attr_read_block() -> ntfs_read_block(),
1127 ntfs_file_read_page() -> ntfs_readpage().
1128 - Rewrite fs/ntfs/aops.c::ntfs_readpage() to work on both real and
1129 attribute inodes, and both for files and directories.
1130 - Remove obsolete fs/ntfs/aops.c::ntfs_mst_readpage().
1131
11322.0.14 - Run list merging code cleanup, minor locking changes, typo fixes.
1133
1134 - Change fs/ntfs/super.c::ntfs_statfs() to not rely on BKL by moving
1135 the locking out of super.c::get_nr_free_mft_records() and taking and
1136 dropping the mftbmp_lock rw_semaphore in ntfs_statfs() itself.
1137 - Bring attribute runlist merging code (fs/ntfs/attrib.c) in sync with
1138 current userspace ntfs library code. This means that if a merge
1139 fails the original runlists are always left unmodified instead of
1140 being silently corrupted.
1141 - Misc typo fixes.
1142
11432.0.13 - Use iget5_locked() in preparation for fake inodes and small cleanups.
1144
1145 - Remove nr_mft_bits and the now superfluous union with nr_mft_records
1146 from ntfs_volume structure.
1147 - Remove nr_lcn_bits and the now superfluous union with nr_clusters
1148 from ntfs_volume structure.
1149 - Use iget5_locked() and friends instead of conventional iget(). Wrap
1150 the call in fs/ntfs/inode.c::ntfs_iget() and update callers of iget()
1151 to use ntfs_iget(). Leave only one iget() call at mount time so we
1152 don't need an ntfs_iget_mount().
1153 - Change fs/ntfs/inode.c::ntfs_new_extent_inode() to take mft_no as an
1154 additional argument.
1155
11562.0.12 - Initial cleanup of address space operations following 2.0.11 changes.
1157
1158 - Merge fs/ntfs/aops.c::end_buffer_read_mst_async() and
1159 fs/ntfs/aops.c::end_buffer_read_file_async() into one function
1160 fs/ntfs/aops.c::end_buffer_read_attr_async() using NInoMstProtected()
1161 to determine whether to apply mst fixups or not.
1162 - Above change allows merging fs/ntfs/aops.c::ntfs_file_read_block()
1163 and fs/ntfs/aops.c::ntfs_mst_readpage() into one function
1164 fs/ntfs/aops.c::ntfs_attr_read_block(). Also, create a tiny wrapper
1165 fs/ntfs/aops.c::ntfs_mst_readpage() to transform the parameters from
1166 the VFS readpage function prototype to the ntfs_attr_read_block()
1167 function prototype.
1168
11692.0.11 - Initial preparations for fake inode based attribute i/o.
1170
1171 - Move definition of ntfs_inode_state_bits to fs/ntfs/inode.h and
1172 do some macro magic (adapted from include/linux/buffer_head.h) to
1173 expand all the helper functions NInoFoo(), NInoSetFoo(), and
1174 NInoClearFoo().
1175 - Add new flag to ntfs_inode_state_bits: NI_Sparse.
1176 - Add new fields to ntfs_inode structure to allow use of fake inodes
1177 for attribute i/o: type, name, name_len. Also add new state bits:
1178 NI_Attr, which, if set, indicates the inode is a fake inode, and
1179 NI_MstProtected, which, if set, indicates the attribute uses multi
1180 sector transfer protection, i.e. fixups need to be applied after
1181 reads and before/after writes.
1182 - Rename fs/ntfs/inode.c::ntfs_{new,clear,destroy}_inode() to
1183 ntfs_{new,clear,destroy}_extent_inode() and update callers.
1184 - Use ntfs_clear_extent_inode() in fs/ntfs/inode.c::__ntfs_clear_inode()
1185 instead of ntfs_destroy_extent_inode().
1186 - Cleanup memory deallocations in {__,}ntfs_clear_{,big_}inode().
1187 - Make all operations on ntfs inode state bits use the NIno* functions.
1188 - Set up the new ntfs inode fields and state bits in
1189 fs/ntfs/inode.c::ntfs_read_inode() and add appropriate cleanup of
1190 allocated memory to __ntfs_clear_inode().
1191 - Cleanup ntfs_inode structure a bit for better ordering of elements
1192 w.r.t. their size to allow better packing of the structure in memory.
1193
11942.0.10 - There can only be 2^32 - 1 inodes on an NTFS volume.
1195
1196 - Add check at mount time to verify that the number of inodes on the
1197 volume does not exceed 2^32 - 1, which is the maximum allowed for
1198 NTFS according to Microsoft.
1199 - Change mft_no member of ntfs_inode structure to be unsigned long.
1200 Update all users. This makes ntfs_inode->mft_no just a copy of struct
1201 inode->i_ino. But we can't just always use struct inode->i_ino and
1202 remove mft_no because extent inodes do not have an attached struct
1203 inode.
1204
12052.0.9 - Decompression engine now uses a single buffer and other cleanups.
1206
1207 - Change decompression engine to use a single buffer protected by a
1208 spin lock instead of per-CPU buffers. (Rusty Russell)
1209 - Do not update cb_pos when handling a partial final page during
1210 decompression of a sparse compression block, as the value is later
1211 reset without being read/used. (Rusty Russell)
1212 - Switch to using the new KM_BIO_SRC_IRQ for atomic kmap()s. (Andrew
1213 Morton)
1214 - Change buffer size in ntfs_readdir()/ntfs_filldir() to use
1215 NLS_MAX_CHARSET_SIZE which makes the buffers almost 1kiB each but
1216 it also makes everything safer so it is a good thing.
1217 - Miscellaneous minor cleanups to comments.
1218
12192.0.8 - Major updates for handling of case sensitivity and dcache aliasing.
1220
1221 Big thanks go to Al Viro and other inhabitants of #kernel for investing
1222 their time to discuss the case sensitivity and dcache aliasing issues.
1223
1224 - Remove unused source file fs/ntfs/attraops.c.
1225 - Remove show_inodes mount option(s), thus dropping support for
1226 displaying of short file names.
1227 - Remove deprecated mount option posix.
1228 - Restore show_sys_files mount option.
1229 - Add new mount option case_sensitive, to determine if the driver
1230 treats file names as case sensitive or not. If case sensitive, create
1231 file names in the POSIX namespace. Otherwise create file names in the
1232 LONG/WIN32 namespace. Note, files remain accessible via their short
1233 file name, if it exists.
1234 - Remove really dumb logic bug in boot sector recovery code.
1235 - Fix dcache aliasing issues wrt short/long file names via changes
1236 to fs/ntfs/dir.c::ntfs_lookup_inode_by_name() and
1237 fs/ntfs/namei.c::ntfs_lookup():
1238 - Add additional argument to ntfs_lookup_inode_by_name() in which we
1239 return information about the matching file name if the case is not
1240 matching or the match is a short file name. See comments above the
1241 function definition for details.
1242 - Change ntfs_lookup() to only create dcache entries for the correctly
1243 cased file name and only for the WIN32 namespace counterpart of DOS
1244 namespace file names. This ensures we have only one dentry per
1245 directory and also removes all dcache aliasing issues between short
1246 and long file names once we add write support. See comments above
1247 function for details.
1248 - Fix potential 1 byte overflow in fs/ntfs/unistr.c::ntfs_ucstonls().
1249
12502.0.7 - Minor cleanups and updates for changes in core kernel code.
1251
1252 - Remove much of the NULL struct element initializers.
1253 - Various updates to make compatible with recent kernels.
1254 - Remove defines of MAX_BUF_PER_PAGE and include linux/buffer_head.h
1255 in fs/ntfs/ntfs.h instead.
1256 - Remove no longer needed KERNEL_VERSION checks. We are now in the
1257 kernel proper so they are no longer needed.
1258
12592.0.6 - Major bugfix to make compatible with other kernel changes.
1260
1261 - Initialize the mftbmp address space properly now that there are more
1262 fields in the struct address_space. This was leading to hangs and
1263 oopses on umount since 2.5.12 because of changes to other parts of
1264 the kernel. We probably want a kernel generic init_address_space()
1265 function...
1266 - Drop BKL from ntfs_readdir() after consultation with Al Viro. The
1267 only caller of ->readdir() is vfs_readdir() which holds i_mutex
1268 during the call, and i_mutex is sufficient protection against changes
1269 in the directory inode (including ->i_size).
1270 - Use generic_file_llseek() for directories (as opposed to
1271 default_llseek()) as this downs i_mutex instead of the BKL which is
1272 what we now need for exclusion against ->f_pos changes considering we
1273 no longer take the BKL in ntfs_readdir().
1274
12752.0.5 - Major bugfix. Buffer overflow in extent inode handling.
1276
1277 - No need to set old blocksize in super.c::ntfs_fill_super() as the
1278 VFS does so via invocation of deactivate_super() calling
1279 fs->fill_super() calling block_kill_super() which does it.
1280 - BKL moved from VFS into dir.c::ntfs_readdir(). (Linus Torvalds)
1281 -> Do we really need it? I don't think so as we have exclusion on
1282 the directory ntfs_inode rw_semaphore mrec_lock. We might have to
1283 move the ->f_pos accesses under the mrec_lock though. Check this...
1284 - Fix really, really, really stupid buffer overflow in extent inode
1285 handling in mft.c::map_extent_mft_record().
1286
12872.0.4 - Cleanups and updates for kernel 2.5.11.
1288
1289 - Add documentation on how to use the MD driver to be able to use NTFS
1290 stripe and volume sets in Linux and generally cleanup documentation
1291 a bit.
1292 Remove all uses of kdev_t in favour of struct block_device *:
1293 - Change compress.c::ntfs_file_read_compressed_block() to use
1294 sb_getblk() instead of getblk().
1295 - Change super.c::ntfs_fill_super() to use bdev_hardsect_size() instead
1296 of get_hardsect_size().
1297 - No need to get old blocksize in super.c::ntfs_fill_super() as
1298 fs/super.c::get_sb_bdev() already does this.
1299 - Set bh->b_bdev instead of bh->b_dev throughout aops.c.
1300
13012.0.3 - Small bug fixes, cleanups, and performance improvements.
1302
1303 - Remove some dead code from mft.c.
1304 - Optimize readpage and read_block functions throughout aops.c so that
1305 only initialized blocks are read. Non-initialized ones have their
1306 buffer head mapped, zeroed, and set up to date, without scheduling
1307 any i/o. Thanks to Al Viro for advice on how to avoid the device i/o.
1308 Thanks go to Andrew Morton for spotting the below:
1309 - Fix buglet in allocate_compression_buffers() error code path.
1310 - Call flush_dcache_page() after modifying page cache page contents in
1311 ntfs_file_readpage().
1312 - Check for existence of page buffers throughout aops.c before calling
1313 create_empty_buffers(). This happens when an I/O error occurs and the
1314 read is retried. (It also happens once writing is implemented so that
1315 needed doing anyway but I had left it for later...)
1316 - Don't BUG_ON() uptodate and/or mapped buffers throughout aops.c in
1317 readpage and read_block functions. Reasoning same as above (i.e. I/O
1318 error retries and future write code paths.)
1319
13202.0.2 - Minor updates and cleanups.
1321
1322 - Cleanup: rename mst.c::__post_read_mst_fixup to post_write_mst_fixup
1323 and cleanup the code a bit, removing the unused size parameter.
1324 - Change default fmask to 0177 and update documentation.
1325 - Change attrib.c::get_attr_search_ctx() to return the search context
1326 directly instead of taking the address of a pointer. A return value
1327 of NULL means the allocation failed. Updated all callers
1328 appropriately.
1329 - Update to 2.5.9 kernel (preserving backwards compatibility) by
1330 replacing all occurrences of page->buffers with page_buffers(page).
1331 - Fix minor bugs in runlist merging, also minor cleanup.
1332 - Updates to bootsector layout and mft mirror contents descriptions.
1333 - Small bug fix in error detection in unistr.c and some cleanups.
1334 - Grow name buffer allocations in unistr.c in aligned multiples of 64
1335 bytes.
1336
13372.0.1 - Minor updates.
1338
1339 - Make default umask correspond to documentation.
1340 - Improve documentation.
1341 - Set default mode to include execute bit. The {u,f,d}mask can be used
1342 to take it away if desired. This allows binaries to be executed from
1343 a mounted ntfs partition.
1344
13452.0.0 - New version number. Remove TNG from the name. Now in the kernel.
1346
1347 - Add kill_super, just keeping up with the vfs changes in the kernel.
1348 - Repeat some changes from tng-0.0.8 that somehow got lost on the way
1349 from the CVS import into BitKeeper.
1350 - Begin to implement proper handling of allocated_size vs
1351 initialized_size vs data_size (i.e. i_size). Done are
1352 mft.c::ntfs_mft_readpage(), aops.c::end_buffer_read_index_async(),
1353 and attrib.c::load_attribute_list().
1354 - Lock the runlist in attrib.c::load_attribute_list() while using it.
1355 - Fix memory leak in ntfs_file_read_compressed_block() and generally
1356 clean up compress.c a little, removing some uncommented/unused debug
1357 code.
1358 - Tidy up dir.c a little bit.
1359 - Don't bother getting the runlist in inode.c::ntfs_read_inode().
1360 - Merge mft.c::ntfs_mft_readpage() and aops.c::ntfs_index_readpage()
1361 creating aops.c::ntfs_mst_readpage(), improving the handling of
1362 holes and overflow in the process and implementing the correct
1363 equivalent of ntfs_file_get_block() in ntfs_mst_readpage() itself.
1364 I am aiming for correctness at the moment. Modularisation can come
1365 later.
1366 - Rename aops.c::end_buffer_read_index_async() to
1367 end_buffer_read_mst_async() and optimize the overflow checking and
1368 handling.
1369 - Use the host of the mftbmp address space mapping to hold the ntfs
1370 volume. This is needed so the async i/o completion handler can
1371 retrieve a pointer to the volume. Hopefully this will not cause
1372 problems elsewhere in the kernel... Otherwise will need to use a
1373 fake inode.
1374 - Complete implementation of proper handling of allocated_size vs
1375 initialized_size vs data_size (i.e. i_size) in whole driver.
1376 Basically aops.c is now completely rewritten.
1377 - Change NTFS driver name to just NTFS and set version number to 2.0.0
1378 to make a clear distinction from the old driver which is still on
1379 version 1.1.22.
1380
1381tng-0.0.8 - 08/03/2002 - Now using BitKeeper, http://linux-ntfs.bkbits.net/
1382
1383 - Replace bdevname(sb->s_dev) with sb->s_id.
1384 - Remove now superfluous new-line characters in all callers of
1385 ntfs_debug().
1386 - Apply kludge in ntfs_read_inode(), setting i_nlink to 1 for
1387 directories. Without this the "find" utility gets very upset which is
1388 fair enough as Linux/Unix do not support directory hard links.
1389 - Further runlist merging work. (Richard Russon)
1390 - Backwards compatibility for gcc-2.95. (Richard Russon)
1391 - Update to kernel 2.5.5-pre1 and rediff the now tiny patch.
1392 - Convert to new filesystem declaration using ->ntfs_get_sb() and
1393 replacing ntfs_read_super() with ntfs_fill_super().
1394 - Set s_maxbytes to MAX_LFS_FILESIZE to avoid page cache page index
1395 overflow on 32-bit architectures.
1396 - Cleanup upcase loading code to use ntfs_(un)map_page().
1397 - Disable/reenable preemption in critical sections of compression engine.
1398 - Replace device size determination in ntfs_fill_super() with
1399 sb->s_bdev->bd_inode->i_size (in bytes) and remove now superfluous
1400 function super.c::get_nr_blocks().
1401 - Implement a mount time option (show_inodes) allowing choice of which
1402 types of inode names readdir() returns and modify ntfs_filldir()
1403 accordingly. There are several parameters to show_inodes:
1404 system: system files
1405 win32: long file names (including POSIX file names) [DEFAULT]
1406 long: same as win32
1407 dos: short file names only (excluding POSIX file names)
1408 short: same as dos
1409 posix: same as both win32 and dos
1410 all: all file names
1411 Note that the options are additive, i.e. specifying:
1412 -o show_inodes=system,show_inodes=win32,show_inodes=dos
1413 is the same as specifying:
1414 -o show_inodes=all
1415 Note that the "posix" and "all" options will show all directory
1416 names, BUT the link count on each directory inode entry is set to 1,
1417 due to Linux not supporting directory hard links. This may well
1418 confuse some userspace applications, since the directory names will
1419 have the same inode numbers. Thus it is NOT advisable to use the
1420 "posix" or "all" options. We provide them only for completeness' sake.
1421 - Add copies of allocated_size, initialized_size, and compressed_size to
1422 the ntfs inode structure and set them up in
1423 inode.c::ntfs_read_inode(). These reflect the unnamed data attribute
1424 for files and the index allocation attribute for directories.
1425 - Add copies of allocated_size and initialized_size to ntfs inode for
1426 $BITMAP attribute of large directories and set them up in
1427 inode.c::ntfs_read_inode().
1428 - Add copies of allocated_size and initialized_size to ntfs volume for
1429 $BITMAP attribute of $MFT and set them up in
1430 super.c::load_system_files().
1431 - Parse deprecated ntfs driver options (iocharset, show_sys_files,
1432 posix, and utf8) and tell user what the new options to use are. Note
1433 we still do support them but they will be removed with kernel 2.7.x.
1434 - Change all occurrences of integer long long printf formatting to hex
1435 as printk() will not support long long integer format if/when the
1436 div64 patch goes into the kernel.
1437 - Make slab caches have stable names and change the names to what they
1438 were intended to be. These changes are required/made possible by the
1439 new slab cache name handling which removes the length limitation by
1440 requiring the caller of kmem_cache_create() to supply a stable name
1441 which is then referenced but not copied.
1442 - Rename run_list structure to run_list_element and create a new
1443 run_list structure containing a pointer to a run_list_element
1444 structure and a read/write semaphore. Adapt all users of runlists
1445 to new scheme and take and release the lock as needed. This fixes a
1446 nasty race as the run_list changes even when inodes are locked for
1447 reading and even when the inode isn't locked at all, so we really
1448 needed the serialization. We use a semaphore rather than a spinlock
1449 as memory allocations can sleep and doing everything GFP_ATOMIC
1450 would be silly.
1451 - Cleanup read_inode() removing all code checking for lowest_vcn != 0.
1452 This can never happen due to the nature of lookup_attr() and how we
1453 support attribute lists. If it did happen it would imply the inode
1454 being corrupt.
1455 - Check for lowest_vcn != 0 in ntfs_read_inode() and mark the inode as
1456 bad if found.
1457 - Update to 2.5.6-pre2 changes in struct address_space.
1458 - Use parent_ino() when accessing d_parent inode number in dir.c.
1459 - Import Sourceforge CVS repository into BitKeeper repository:
1460 http://linux-ntfs.bkbits.net/ntfs-tng-2.5
1461 - Update fs/Makefile, fs/Config.help, fs/Config.in, and
1462 Documentation/filesystems/ntfs.txt for NTFS TNG.
1463 - Create kernel configuration option controlling whether debugging
1464 is enabled or not.
1465 - Add the required export of end_buffer_io_sync() from the patches
1466 directory to the kernel code.
1467 - Update inode.c::ntfs_show_options() with show_inodes mount option.
1468 - Update errors mount option.
1469
1470tng-0.0.7 - 13/02/2002 - The driver is now feature complete for read-only!
1471
1472 - Cleanup mft.c and its debug/error output in particular. Fix a minor
1473 bug in mapping of extent inodes. Update all the comments to fit all
1474 the recent code changes.
1475 - Modify vcn_to_lcn() to cope with entirely unmapped runlists.
1476 - Cleanups in compress.c, mostly comments and folding help.
1477 - Implement attrib.c::map_run_list() as a generic helper.
1478 - Make compress.c::ntfs_file_read_compressed_block() use map_run_list()
1479 thus making code shorter and enabling attribute list support.
1480 - Cleanup incorrect use of [su]64 with %L printf format specifier in
1481 all source files. Type casts to [unsigned] long long added to correct
1482 the mismatches (important for architectures which have long long not
1483 being 64 bits).
1484 - Merge async io completion handlers for directory indexes and $MFT
1485 data into one by setting the index_block_size{_bits} of the ntfs
1486 inode for $MFT to the mft_record_size{_bits} of the ntfs_volume.
1487 - Cleanup aops.c, update comments.
1488 - Make ntfs_file_get_block() use map_run_list() so all files now
1489 support attribute lists.
1490 - Make ntfs_dir_readpage() almost verbatim copy of
1491 block_read_full_page() by using ntfs_file_get_block() with only real
1492 difference being the use of our own async io completion handler
1493 rather than the default one, thus reducing the amount of code and
1494 automatically enabling attribute list support for directory indices.
1495 - Fix bug in load_attribute_list() - forgot to call brelse in error
1496 code path.
1497 - Change parameters to find_attr() and lookup_attr(). We no longer
1498 pass in the upcase table and its length. These can be gotten from
1499 ctx->ntfs_ino->vol->upcase{_len}. Update all callers.
1500 - Cleanups in attrib.c.
1501 - Implement merging of runlists, attrib.c::merge_run_lists() and its
1502 helpers. (Richard Russon)
1503 - Attribute lists part 2, attribute extents and multi part runlists:
1504 enable proper support for LCN_RL_NOT_MAPPED and automatic mapping of
1505 further runlist parts via attrib.c::map_run_list().
1506 - Tiny endianness bug fix in decompress_mapping_pairs().
1507
1508tng-0.0.6 - Encrypted directories, bug fixes, cleanups, debugging enhancements.
1509
1510 - Enable encrypted directories. (Their index root is marked encrypted
1511 to indicate that new files in that directory should be created
1512 encrypted.)
1513 - Fix bug in NInoBmpNonResident() macro. (Cut and paste error.)
1514 - Enable $Extend system directory. Most (if not all) extended system
1515 files do not have unnamed data attributes so ntfs_read_inode() had to
1516 special case them but that is ok, as the special casing recovery
1517 happens inside an error code path so there is zero slow down in the
1518 normal fast path. The special casing is done by introducing a new
1519 function inode.c::ntfs_is_extended_system_file() which checks if any
1520 of the hard links in the inode point to $Extend as being their parent
1521 directory and if they do we assume this is an extended system file.
1522 - Create a sysctl/proc interface to allow {dis,en}abling of debug output
1523 when compiled with -DDEBUG. Default is debug messages to be disabled.
1524 To enable them, one writes a non-zero value to /proc/sys/fs/ntfs-debug
1525 (if /proc is enabled) or uses sysctl(2) to effect the same (if sysctl
1526 interface is enabled). Inspired by old ntfs driver.
1527 - Add debug_msgs insmod/kernel boot parameter to set whether debug
1528 messages are {dis,en}abled. This is useful to enable debug messages
1529 during ntfs initialization and is the only way to activate debugging
1530 when the sysctl interface is not enabled.
1531 - Cleanup debug output in various places.
1532 - Remove all dollar signs ($) from the source (except comments) to
1533 enable compilation on architectures whose gcc compiler does not
1534 support dollar signs in the names of variables/constants. Attribute
1535 types now start with AT_ instead of $ and $I30 is now just I30.
1536 - Cleanup ntfs_lookup() and add consistency check of sequence numbers.
1537 - Load complete runlist for $MFT/$BITMAP during mount and cleanup
1538 access functions. This means we now cope with $MFT/$BITMAP being
1539 spread across several mft records.
1540 - Disable modification of mft_zone_multiplier on remount. We can always
1541 reenable this later on if we really want to, but we will need to make
1542 sure we readjust the mft_zone size / layout accordingly.
1543
1544tng-0.0.5 - Modernize for 2.5.x and further in line-ing with Al Viro's comments.
1545
1546 - Use sb_set_blocksize() instead of set_blocksize() and verify the
1547 return value.
1548 - Use sb_bread() instead of bread() throughout.
1549 - Add index_vcn_size{_bits} to ntfs_inode structure to store the size
1550 of a directory index block vcn. Apply resulting simplifications in
1551 dir.c everywhere.
1552 - Fix a small bug somewhere (but forgot what it was).
1553 - Change ntfs_{debug,error,warning} to enable gcc to do type checking
1554 on the printf-format parameter list and fix bugs reported by gcc
1555 as a result. (Richard Russon)
1556 - Move inode allocation strategy to Al's new stuff but maintain the
1557 divorce of ntfs_inode from struct inode. To achieve this we have two
1558 separate slab caches, one for big ntfs inodes containing a struct
1559 inode and pure ntfs inodes and at the same time fix some faulty
1560 error code paths in ntfs_read_inode().
1561 - Show mount options in proc (inode.c::ntfs_show_options()).
1562
1563tng-0.0.4 - Big changes, getting in line with Al Viro's comments.
1564
1565 - Modified (un)map_mft_record functions to be common for read and write
1566 case. To specify which is which, added extra parameter at front of
1567 parameter list. Pass either READ or WRITE to this, each has the
1568 obvious meaning.
1569 - General cleanups to allow for easier folding in vi.
1570 - attrib.c::decompress_mapping_pairs() now accepts the old runlist
1571 argument, and invokes attrib.c::merge_run_lists() to merge the old
1572 and the new runlists.
1573 - Removed attrib.c::find_first_attr().
1574 - Implemented loading of attribute list and complete runlist for $MFT.
1575 This means we now cope with $MFT being spread across several mft
1576 records.
1577 - Adapt to 2.5.2-pre9 and the changed create_empty_buffers() syntax.
1578 - Adapt major/minor/kdev_t/[bk]devname stuff to new 2.5.x kernels.
1579 - Make ntfs_volume be allocated via kmalloc() instead of using a slab
1580 cache. There are too few ntfs_volume structures at any one time
1581 to justify a private slab cache.
1582 - Fix bogus kmap() use in async io completion. Now use kmap_atomic().
1583 Use KM_BIO_IRQ on advice from IRC/kernel...
1584 - Use ntfs_map_page() in map_mft_record() and create ->readpage method
1585 for reading $MFT (ntfs_mft_readpage). In the process create dedicated
1586 address space operations (ntfs_mft_aops) for $MFT inode mapping. Also
1587 removed the now superfluous exports from the kernel core patch.
1588 - Fix a bug where kfree() was used instead of ntfs_free().
1589 - Change map_mft_record() to take ntfs_inode as argument instead of
1590 vfs inode. Ditto for unmap_mft_record(). Adapt all callers.
1591 - Add pointer to ntfs_volume to ntfs_inode.
1592 - Add mft record number and sequence number to ntfs_inode. Stop using
1593 i_ino and i_generation for in-driver purposes.
1594 - Implement attrib.c::merge_run_lists(). (Richard Russon)
1595 - Remove use of proper inodes by extent inodes. Move i_ino and
1596 i_generation to ntfs_inode to do this. Apply simplifications that
1597 result and remove iget_no_wait(), etc.
1598 - Pass ntfs_inode everywhere in the driver (used to be struct inode).
1599 - Add reference counting in ntfs_inode for the ntfs inode itself and
1600 for the mapped mft record.
1601 - Extend mft record mapping so we can (un)map extent mft records (new
1602 functions (un)map_extent_mft_record), and so mappings are reference
1603 counted and don't have to happen twice if already mapped - just ref
1604 count increases.
1605 - Add -o iocharset as alias to -o nls for backwards compatibility.
1606 - The latest core patch is now tiny. In fact just a single additional
1607 export is necessary over the base kernel.
1608
1609tng-0.0.3 - Cleanups, enhancements, bug fixes.
1610
1611 - Work on attrib.c::decompress_mapping_pairs() to detect base extents
1612 and setup the runlist appropriately using knowledge provided by the
1613 sizes in the base attribute record.
1614 - Balance the get_/put_attr_search_ctx() calls so we don't leak memory
1615 any more.
1616 - Introduce ntfs_malloc_nofs() and ntfs_free() to allocate/free a single
1617 page or use vmalloc depending on the amount of memory requested.
1618 - Cleanup error output. The __FUNCTION__ "(): " is now added
1619 automatically. Introduced a new header file debug.h to support this
1620 and also moved ntfs_debug() function into it.
1621 - Make reading of compressed files more intelligent and especially get
1622 rid of the vmalloc_nofs() from readpage(). This now uses per CPU
1623 buffers (allocated at first mount with cluster size <= 4kiB and
1624 deallocated on last umount with cluster size <= 4kiB), and
1625 asynchronous io for the compressed data using a list of buffer heads.
1626 Er, we use synchronous io as async io only works on whole pages
1627 covered by buffers and not on individual buffer heads...
1628 - Bug fix for reading compressed files with sparse compression blocks.
1629
1630tng-0.0.2 - Now handles larger/fragmented/compressed volumes/files/dirs.
1631
1632 - Fixed handling of directories when cluster size exceeds index block
1633 size.
1634 - Hide DOS only name space directory entries from readdir() but allow
1635 them in lookup(). This should fix the problem that Linux doesn't
1636 support directory hard links, while still allowing access to entries
1637 via their short file name. This also has the benefit of mimicking
1638 what Windows users are used to, so it is the ideal solution.
1639 - Implemented sync_page everywhere so no more hangs in D state when
1640 waiting for a page.
1641 - Stop using bforget() in favour of brelse().
1642 - Stop locking buffers unnecessarily.
1643 - Implemented compressed files (inode->mapping contains uncompressed
1644 data, raw compressed data is currently bread() into a vmalloc()ed
1645 memory buffer).
1646 - Enable compressed directories. (Their index root is marked compressed
1647 to indicate that new files in that directory should be created
1648 compressed.)
1649 - Use vsnprintf rather than vsprintf in the ntfs_error and ntfs_warning
1650 functions. (Thanks to Will Dyson for pointing this out.)
1651 - Moved the ntfs_inode and ntfs_volume (the former ntfs_inode_info and
1652 ntfs_sb_info) out of the common inode and super_block structures and
1653 started using the generic_ip and generic_sbp pointers instead. This
1654 makes ntfs entirely private with respect to the kernel tree.
1655 - Detect compiler version and abort with error message if gcc less than
1656 2.96 is used.
1657 - Fix bug in name comparison function in unistr.c.
1658 - Implement attribute lists part 1, the infrastructure: search contexts
1659 and operations, find_external_attr(), lookup_attr() and make the
1660 code use the infrastructure.
1661 - Fix stupid buffer overflow bug that became apparent on larger run
1662 list containing attributes.
1663 - Fix bugs in readdir() that became apparent on larger directories.
1664
1665 The driver is now really useful and survives the test
1666 find . -type f -exec md5sum "{}" \;
1667 without any error messages on an over 1GiB sized partition with >16k
1668 files on it, including compressed files and directories and many files
1669 and directories with attribute lists.
1670
1671tng-0.0.1 - The first useful version.
1672
1673 - Added ntfs_lookup().
1674 - Added default upcase generation and handling.
1675 - Added compile options to be shown on module init.
1676 - Many bug fixes that were "hidden" before.
1677 - Update to latest kernel.
1678 - Added ntfs_readdir().
1679 - Added file operations for mmap(), read(), open() and llseek(). We just
1680 use the generic ones. The whole point of going through implementing
1681 readpage() methods and where possible get_block() call backs is that
1682 this allows us to make use of the generic high level methods provided
1683 by the kernel.
1684
1685 The driver is now actually useful! Yey. (-: It undoubtedly has got bugs
1686 though and it doesn't implement accessing compressed files yet. Also,
1687 accessing files with attribute list attributes is not implemented yet
1688 either. But for small or simple filesystems it should work and allow
1689 you to list directories, use stat on directory entries and the file
1690 system, open, read, mmap and llseek around in files. A big milestone
1691 has been reached!
1692
1693tng-0.0.0 - Initial version tag.
1694
1695 Initial driver implementation. The driver can mount and umount simple
1696 NTFS filesystems (i.e. ones without attribute lists in the system
1697 files). If the mount fails there might be problems in the error handling
1698 code paths, so be warned. Otherwise it seems to be loading the system
1699 files nicely and the mft record read mapping/unmapping seems to be
1700 working nicely, too. Proof of inode metadata in the page cache and non-
1701 resident file unnamed stream data in the page cache concepts is thus
1702 complete.
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index cfce53cb65d7..c3c2c7ac9020 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -23,6 +23,7 @@
23 23
24#include <linux/errno.h> 24#include <linux/errno.h>
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/gfp.h>
26#include <linux/mm.h> 27#include <linux/mm.h>
27#include <linux/pagemap.h> 28#include <linux/pagemap.h>
28#include <linux/swap.h> 29#include <linux/swap.h>
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 50d3b0c258e3..f5094ee224c1 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -22,6 +22,7 @@
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/slab.h>
25#include <linux/swap.h> 26#include <linux/swap.h>
26#include <linux/writeback.h> 27#include <linux/writeback.h>
27 28
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 08f7530e9341..6551c7cbad92 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -25,6 +25,7 @@
25#include <linux/buffer_head.h> 25#include <linux/buffer_head.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
28#include <linux/slab.h>
28 29
29#include "attrib.h" 30#include "attrib.h"
30#include "inode.h" 31#include "inode.h"
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 5a9e34475e37..fe44d3feee4a 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/slab.h>
24 25
25#include "dir.h" 26#include "dir.h"
26#include "aops.h" 27#include "aops.h"
@@ -1545,7 +1546,7 @@ static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry,
1545 write_inode_now(bmp_vi, !datasync); 1546 write_inode_now(bmp_vi, !datasync);
1546 iput(bmp_vi); 1547 iput(bmp_vi);
1547 } 1548 }
1548 ret = ntfs_write_inode(vi, 1); 1549 ret = __ntfs_write_inode(vi, 1);
1549 write_inode_now(vi, !datasync); 1550 write_inode_now(vi, !datasync);
1550 err = sync_blockdev(vi->i_sb->s_bdev); 1551 err = sync_blockdev(vi->i_sb->s_bdev);
1551 if (unlikely(err && !ret)) 1552 if (unlikely(err && !ret))
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 43179ddd336f..8804f093ba75 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/gfp.h>
23#include <linux/pagemap.h> 24#include <linux/pagemap.h>
24#include <linux/pagevec.h> 25#include <linux/pagevec.h>
25#include <linux/sched.h> 26#include <linux/sched.h>
@@ -2182,7 +2183,7 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
2182 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); 2183 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
2183 BUG_ON(S_ISDIR(vi->i_mode)); 2184 BUG_ON(S_ISDIR(vi->i_mode));
2184 if (!datasync || !NInoNonResident(NTFS_I(vi))) 2185 if (!datasync || !NInoNonResident(NTFS_I(vi)))
2185 ret = ntfs_write_inode(vi, 1); 2186 ret = __ntfs_write_inode(vi, 1);
2186 write_inode_now(vi, !datasync); 2187 write_inode_now(vi, !datasync);
2187 /* 2188 /*
2188 * NOTE: If we were to use mapping->private_list (see ext2 and 2189 * NOTE: If we were to use mapping->private_list (see ext2 and
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index 2194eff49743..096c135691ae 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -19,6 +19,8 @@
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21 21
22#include <linux/slab.h>
23
22#include "aops.h" 24#include "aops.h"
23#include "collate.h" 25#include "collate.h"
24#include "debug.h" 26#include "debug.h"
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index dc2505abb6d7..4b57fb1eac2a 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2957,7 +2957,7 @@ out:
2957 * 2957 *
2958 * Return 0 on success and -errno on error. 2958 * Return 0 on success and -errno on error.
2959 */ 2959 */
2960int ntfs_write_inode(struct inode *vi, int sync) 2960int __ntfs_write_inode(struct inode *vi, int sync)
2961{ 2961{
2962 sle64 nt; 2962 sle64 nt;
2963 ntfs_inode *ni = NTFS_I(vi); 2963 ntfs_inode *ni = NTFS_I(vi);
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index 117eaf8032a3..9a113544605d 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -307,12 +307,12 @@ extern void ntfs_truncate_vfs(struct inode *vi);
307 307
308extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr); 308extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr);
309 309
310extern int ntfs_write_inode(struct inode *vi, int sync); 310extern int __ntfs_write_inode(struct inode *vi, int sync);
311 311
312static inline void ntfs_commit_inode(struct inode *vi) 312static inline void ntfs_commit_inode(struct inode *vi)
313{ 313{
314 if (!is_bad_inode(vi)) 314 if (!is_bad_inode(vi))
315 ntfs_write_inode(vi, 1); 315 __ntfs_write_inode(vi, 1);
316 return; 316 return;
317} 317}
318 318
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 1caa0ef0b2bb..b572b6727181 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/slab.h>
24#include <linux/swap.h> 25#include <linux/swap.h>
25 26
26#include "attrib.h" 27#include "attrib.h"
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 2ca00153b6ec..358273e59ade 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -23,6 +23,7 @@
23#include <linux/dcache.h> 23#include <linux/dcache.h>
24#include <linux/exportfs.h> 24#include <linux/exportfs.h>
25#include <linux/security.h> 25#include <linux/security.h>
26#include <linux/slab.h>
26 27
27#include "attrib.h" 28#include "attrib.h"
28#include "debug.h" 29#include "debug.h"
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 80b04770e8e9..0de1db6cddbf 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -31,6 +31,7 @@
31#include <linux/vfs.h> 31#include <linux/vfs.h>
32#include <linux/moduleparam.h> 32#include <linux/moduleparam.h>
33#include <linux/smp_lock.h> 33#include <linux/smp_lock.h>
34#include <linux/bitmap.h>
34 35
35#include "sysctl.h" 36#include "sysctl.h"
36#include "logfile.h" 37#include "logfile.h"
@@ -39,6 +40,7 @@
39#include "dir.h" 40#include "dir.h"
40#include "debug.h" 41#include "debug.h"
41#include "index.h" 42#include "index.h"
43#include "inode.h"
42#include "aops.h" 44#include "aops.h"
43#include "layout.h" 45#include "layout.h"
44#include "malloc.h" 46#include "malloc.h"
@@ -2457,7 +2459,6 @@ static void ntfs_put_super(struct super_block *sb)
2457static s64 get_nr_free_clusters(ntfs_volume *vol) 2459static s64 get_nr_free_clusters(ntfs_volume *vol)
2458{ 2460{
2459 s64 nr_free = vol->nr_clusters; 2461 s64 nr_free = vol->nr_clusters;
2460 u32 *kaddr;
2461 struct address_space *mapping = vol->lcnbmp_ino->i_mapping; 2462 struct address_space *mapping = vol->lcnbmp_ino->i_mapping;
2462 struct page *page; 2463 struct page *page;
2463 pgoff_t index, max_index; 2464 pgoff_t index, max_index;
@@ -2476,7 +2477,8 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2476 ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.", 2477 ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.",
2477 max_index, PAGE_CACHE_SIZE / 4); 2478 max_index, PAGE_CACHE_SIZE / 4);
2478 for (index = 0; index < max_index; index++) { 2479 for (index = 0; index < max_index; index++) {
2479 unsigned int i; 2480 unsigned long *kaddr;
2481
2480 /* 2482 /*
2481 * Read the page from page cache, getting it from backing store 2483 * Read the page from page cache, getting it from backing store
2482 * if necessary, and increment the use count. 2484 * if necessary, and increment the use count.
@@ -2489,16 +2491,16 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2489 nr_free -= PAGE_CACHE_SIZE * 8; 2491 nr_free -= PAGE_CACHE_SIZE * 8;
2490 continue; 2492 continue;
2491 } 2493 }
2492 kaddr = (u32*)kmap_atomic(page, KM_USER0); 2494 kaddr = kmap_atomic(page, KM_USER0);
2493 /* 2495 /*
2494 * For each 4 bytes, subtract the number of set bits. If this 2496 * Subtract the number of set bits. If this
2495 * is the last page and it is partial we don't really care as 2497 * is the last page and it is partial we don't really care as
2496 * it just means we do a little extra work but it won't affect 2498 * it just means we do a little extra work but it won't affect
2497 * the result as all out of range bytes are set to zero by 2499 * the result as all out of range bytes are set to zero by
2498 * ntfs_readpage(). 2500 * ntfs_readpage().
2499 */ 2501 */
2500 for (i = 0; i < PAGE_CACHE_SIZE / 4; i++) 2502 nr_free -= bitmap_weight(kaddr,
2501 nr_free -= (s64)hweight32(kaddr[i]); 2503 PAGE_CACHE_SIZE * BITS_PER_BYTE);
2502 kunmap_atomic(kaddr, KM_USER0); 2504 kunmap_atomic(kaddr, KM_USER0);
2503 page_cache_release(page); 2505 page_cache_release(page);
2504 } 2506 }
@@ -2537,7 +2539,6 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2537static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, 2539static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2538 s64 nr_free, const pgoff_t max_index) 2540 s64 nr_free, const pgoff_t max_index)
2539{ 2541{
2540 u32 *kaddr;
2541 struct address_space *mapping = vol->mftbmp_ino->i_mapping; 2542 struct address_space *mapping = vol->mftbmp_ino->i_mapping;
2542 struct page *page; 2543 struct page *page;
2543 pgoff_t index; 2544 pgoff_t index;
@@ -2547,7 +2548,8 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2547 ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = " 2548 ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = "
2548 "0x%lx.", max_index, PAGE_CACHE_SIZE / 4); 2549 "0x%lx.", max_index, PAGE_CACHE_SIZE / 4);
2549 for (index = 0; index < max_index; index++) { 2550 for (index = 0; index < max_index; index++) {
2550 unsigned int i; 2551 unsigned long *kaddr;
2552
2551 /* 2553 /*
2552 * Read the page from page cache, getting it from backing store 2554 * Read the page from page cache, getting it from backing store
2553 * if necessary, and increment the use count. 2555 * if necessary, and increment the use count.
@@ -2560,16 +2562,16 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2560 nr_free -= PAGE_CACHE_SIZE * 8; 2562 nr_free -= PAGE_CACHE_SIZE * 8;
2561 continue; 2563 continue;
2562 } 2564 }
2563 kaddr = (u32*)kmap_atomic(page, KM_USER0); 2565 kaddr = kmap_atomic(page, KM_USER0);
2564 /* 2566 /*
2565 * For each 4 bytes, subtract the number of set bits. If this 2567 * Subtract the number of set bits. If this
2566 * is the last page and it is partial we don't really care as 2568 * is the last page and it is partial we don't really care as
2567 * it just means we do a little extra work but it won't affect 2569 * it just means we do a little extra work but it won't affect
2568 * the result as all out of range bytes are set to zero by 2570 * the result as all out of range bytes are set to zero by
2569 * ntfs_readpage(). 2571 * ntfs_readpage().
2570 */ 2572 */
2571 for (i = 0; i < PAGE_CACHE_SIZE / 4; i++) 2573 nr_free -= bitmap_weight(kaddr,
2572 nr_free -= (s64)hweight32(kaddr[i]); 2574 PAGE_CACHE_SIZE * BITS_PER_BYTE);
2573 kunmap_atomic(kaddr, KM_USER0); 2575 kunmap_atomic(kaddr, KM_USER0);
2574 page_cache_release(page); 2576 page_cache_release(page);
2575 } 2577 }
@@ -2662,6 +2664,13 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
2662 return 0; 2664 return 0;
2663} 2665}
2664 2666
2667#ifdef NTFS_RW
2668static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc)
2669{
2670 return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL);
2671}
2672#endif
2673
2665/** 2674/**
2666 * The complete super operations. 2675 * The complete super operations.
2667 */ 2676 */
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 600d2d2ade11..791c0886c060 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -46,6 +46,7 @@ ocfs2_stackglue-objs := stackglue.o
46ocfs2_stack_o2cb-objs := stack_o2cb.o 46ocfs2_stack_o2cb-objs := stack_o2cb.o
47ocfs2_stack_user-objs := stack_user.o 47ocfs2_stack_user-objs := stack_user.o
48 48
49obj-$(CONFIG_OCFS2_FS) += dlmfs/
49# cluster/ is always needed when OCFS2_FS for masklog support 50# cluster/ is always needed when OCFS2_FS for masklog support
50obj-$(CONFIG_OCFS2_FS) += cluster/ 51obj-$(CONFIG_OCFS2_FS) += cluster/
51obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/ 52obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 0501974bedd0..e13fc9e8fcdc 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -21,6 +21,7 @@
21 21
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/slab.h>
24#include <linux/string.h> 25#include <linux/string.h>
25 26
26#define MLOG_MASK_PREFIX ML_INODE 27#define MLOG_MASK_PREFIX ML_INODE
@@ -30,6 +31,8 @@
30#include "alloc.h" 31#include "alloc.h"
31#include "dlmglue.h" 32#include "dlmglue.h"
32#include "file.h" 33#include "file.h"
34#include "inode.h"
35#include "journal.h"
33#include "ocfs2_fs.h" 36#include "ocfs2_fs.h"
34 37
35#include "xattr.h" 38#include "xattr.h"
@@ -166,6 +169,60 @@ static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
166} 169}
167 170
168/* 171/*
172 * Helper function to set i_mode in memory and disk. Some call paths
173 * will not have di_bh or a journal handle to pass, in which case it
174 * will create it's own.
175 */
176static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
177 handle_t *handle, umode_t new_mode)
178{
179 int ret, commit_handle = 0;
180 struct ocfs2_dinode *di;
181
182 if (di_bh == NULL) {
183 ret = ocfs2_read_inode_block(inode, &di_bh);
184 if (ret) {
185 mlog_errno(ret);
186 goto out;
187 }
188 } else
189 get_bh(di_bh);
190
191 if (handle == NULL) {
192 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
193 OCFS2_INODE_UPDATE_CREDITS);
194 if (IS_ERR(handle)) {
195 ret = PTR_ERR(handle);
196 mlog_errno(ret);
197 goto out_brelse;
198 }
199
200 commit_handle = 1;
201 }
202
203 di = (struct ocfs2_dinode *)di_bh->b_data;
204 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
205 OCFS2_JOURNAL_ACCESS_WRITE);
206 if (ret) {
207 mlog_errno(ret);
208 goto out_commit;
209 }
210
211 inode->i_mode = new_mode;
212 di->i_mode = cpu_to_le16(inode->i_mode);
213
214 ocfs2_journal_dirty(handle, di_bh);
215
216out_commit:
217 if (commit_handle)
218 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
219out_brelse:
220 brelse(di_bh);
221out:
222 return ret;
223}
224
225/*
169 * Set the access or default ACL of an inode. 226 * Set the access or default ACL of an inode.
170 */ 227 */
171static int ocfs2_set_acl(handle_t *handle, 228static int ocfs2_set_acl(handle_t *handle,
@@ -193,9 +250,14 @@ static int ocfs2_set_acl(handle_t *handle,
193 if (ret < 0) 250 if (ret < 0)
194 return ret; 251 return ret;
195 else { 252 else {
196 inode->i_mode = mode;
197 if (ret == 0) 253 if (ret == 0)
198 acl = NULL; 254 acl = NULL;
255
256 ret = ocfs2_acl_set_mode(inode, di_bh,
257 handle, mode);
258 if (ret)
259 return ret;
260
199 } 261 }
200 } 262 }
201 break; 263 break;
@@ -283,6 +345,7 @@ int ocfs2_init_acl(handle_t *handle,
283 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 345 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
284 struct posix_acl *acl = NULL; 346 struct posix_acl *acl = NULL;
285 int ret = 0; 347 int ret = 0;
348 mode_t mode;
286 349
287 if (!S_ISLNK(inode->i_mode)) { 350 if (!S_ISLNK(inode->i_mode)) {
288 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { 351 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
@@ -291,12 +354,17 @@ int ocfs2_init_acl(handle_t *handle,
291 if (IS_ERR(acl)) 354 if (IS_ERR(acl))
292 return PTR_ERR(acl); 355 return PTR_ERR(acl);
293 } 356 }
294 if (!acl) 357 if (!acl) {
295 inode->i_mode &= ~current_umask(); 358 mode = inode->i_mode & ~current_umask();
359 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
360 if (ret) {
361 mlog_errno(ret);
362 goto cleanup;
363 }
364 }
296 } 365 }
297 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { 366 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
298 struct posix_acl *clone; 367 struct posix_acl *clone;
299 mode_t mode;
300 368
301 if (S_ISDIR(inode->i_mode)) { 369 if (S_ISDIR(inode->i_mode)) {
302 ret = ocfs2_set_acl(handle, inode, di_bh, 370 ret = ocfs2_set_acl(handle, inode, di_bh,
@@ -313,7 +381,7 @@ int ocfs2_init_acl(handle_t *handle,
313 mode = inode->i_mode; 381 mode = inode->i_mode;
314 ret = posix_acl_create_masq(clone, &mode); 382 ret = posix_acl_create_masq(clone, &mode);
315 if (ret >= 0) { 383 if (ret >= 0) {
316 inode->i_mode = mode; 384 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
317 if (ret > 0) { 385 if (ret > 0) {
318 ret = ocfs2_set_acl(handle, inode, 386 ret = ocfs2_set_acl(handle, inode,
319 di_bh, ACL_TYPE_ACCESS, 387 di_bh, ACL_TYPE_ACCESS,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d17bdc718f74..9f8bd913c51e 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1050,7 +1050,8 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1050 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); 1050 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
1051 eb->h_blkno = cpu_to_le64(first_blkno); 1051 eb->h_blkno = cpu_to_le64(first_blkno);
1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
1053 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); 1053 eb->h_suballoc_slot =
1054 cpu_to_le16(meta_ac->ac_alloc_slot);
1054 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1055 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1055 eb->h_list.l_count = 1056 eb->h_list.l_count =
1056 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 1057 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -5712,7 +5713,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
5712 goto out; 5713 goto out;
5713 } 5714 }
5714 5715
5715 vfs_dq_free_space_nodirty(inode, 5716 dquot_free_space_nodirty(inode,
5716 ocfs2_clusters_to_bytes(inode->i_sb, len)); 5717 ocfs2_clusters_to_bytes(inode->i_sb, len));
5717 5718
5718 ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc); 5719 ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
@@ -6037,7 +6038,7 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
6037 if (status < 0) 6038 if (status < 0)
6038 mlog_errno(status); 6039 mlog_errno(status);
6039 else 6040 else
6040 ocfs2_init_inode_steal_slot(osb); 6041 ocfs2_init_steal_slots(osb);
6041 6042
6042 mlog_exit(status); 6043 mlog_exit(status);
6043} 6044}
@@ -6935,7 +6936,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6935 goto bail; 6936 goto bail;
6936 } 6937 }
6937 6938
6938 vfs_dq_free_space_nodirty(inode, 6939 dquot_free_space_nodirty(inode,
6939 ocfs2_clusters_to_bytes(osb->sb, clusters_to_del)); 6940 ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6940 spin_lock(&OCFS2_I(inode)->ip_lock); 6941 spin_lock(&OCFS2_I(inode)->ip_lock);
6941 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - 6942 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
@@ -7300,11 +7301,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7300 unsigned int page_end; 7301 unsigned int page_end;
7301 u64 phys; 7302 u64 phys;
7302 7303
7303 if (vfs_dq_alloc_space_nodirty(inode, 7304 ret = dquot_alloc_space_nodirty(inode,
7304 ocfs2_clusters_to_bytes(osb->sb, 1))) { 7305 ocfs2_clusters_to_bytes(osb->sb, 1));
7305 ret = -EDQUOT; 7306 if (ret)
7306 goto out_commit; 7307 goto out_commit;
7307 }
7308 did_quota = 1; 7308 did_quota = 1;
7309 7309
7310 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 7310 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
@@ -7380,7 +7380,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7380 7380
7381out_commit: 7381out_commit:
7382 if (ret < 0 && did_quota) 7382 if (ret < 0 && did_quota)
7383 vfs_dq_free_space_nodirty(inode, 7383 dquot_free_space_nodirty(inode,
7384 ocfs2_clusters_to_bytes(osb->sb, 1)); 7384 ocfs2_clusters_to_bytes(osb->sb, 1));
7385 7385
7386 ocfs2_commit_trans(osb, handle); 7386 ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 3dae4a13f6e4..21441ddb5506 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -577,8 +577,9 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
577 goto bail; 577 goto bail;
578 } 578 }
579 579
580 /* We should already CoW the refcounted extent. */ 580 /* We should already CoW the refcounted extent in case of create. */
581 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); 581 BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
582
582 /* 583 /*
583 * get_more_blocks() expects us to describe a hole by clearing 584 * get_more_blocks() expects us to describe a hole by clearing
584 * the mapped bit on bh_result(). 585 * the mapped bit on bh_result().
@@ -599,7 +600,7 @@ bail:
599 return ret; 600 return ret;
600} 601}
601 602
602/* 603/*
603 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're 604 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
604 * particularly interested in the aio/dio case. Like the core uses 605 * particularly interested in the aio/dio case. Like the core uses
605 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from 606 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
@@ -670,7 +671,7 @@ static ssize_t ocfs2_direct_IO(int rw,
670 671
671 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 672 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
672 inode->i_sb->s_bdev, iov, offset, 673 inode->i_sb->s_bdev, iov, offset,
673 nr_segs, 674 nr_segs,
674 ocfs2_direct_IO_get_blocks, 675 ocfs2_direct_IO_get_blocks,
675 ocfs2_dio_end_io); 676 ocfs2_dio_end_io);
676 677
@@ -1763,10 +1764,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1763 1764
1764 wc->w_handle = handle; 1765 wc->w_handle = handle;
1765 1766
1766 if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode, 1767 if (clusters_to_alloc) {
1767 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) { 1768 ret = dquot_alloc_space_nodirty(inode,
1768 ret = -EDQUOT; 1769 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
1769 goto out_commit; 1770 if (ret)
1771 goto out_commit;
1770 } 1772 }
1771 /* 1773 /*
1772 * We don't want this to fail in ocfs2_write_end(), so do it 1774 * We don't want this to fail in ocfs2_write_end(), so do it
@@ -1809,7 +1811,7 @@ success:
1809 return 0; 1811 return 0;
1810out_quota: 1812out_quota:
1811 if (clusters_to_alloc) 1813 if (clusters_to_alloc)
1812 vfs_dq_free_space(inode, 1814 dquot_free_space(inode,
1813 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); 1815 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
1814out_commit: 1816out_commit:
1815 ocfs2_commit_trans(osb, handle); 1817 ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index d43d34a1dd31..ecebb2276790 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30 29
31#include <cluster/masklog.h> 30#include <cluster/masklog.h>
@@ -368,7 +367,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
368 } 367 }
369 ocfs2_metadata_cache_io_unlock(ci); 368 ocfs2_metadata_cache_io_unlock(ci);
370 369
371 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", 370 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
372 (unsigned long long)block, nr, 371 (unsigned long long)block, nr,
373 ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes", 372 ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes",
374 flags); 373 flags);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index eda5b8bcddd5..41d5f1f92d56 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -34,6 +34,7 @@
34#include <linux/crc32.h> 34#include <linux/crc32.h>
35#include <linux/time.h> 35#include <linux/time.h>
36#include <linux/debugfs.h> 36#include <linux/debugfs.h>
37#include <linux/slab.h>
37 38
38#include "heartbeat.h" 39#include "heartbeat.h"
39#include "tcp.h" 40#include "tcp.h"
@@ -78,7 +79,7 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
78 79
79unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; 80unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
80 81
81/* Only sets a new threshold if there are no active regions. 82/* Only sets a new threshold if there are no active regions.
82 * 83 *
83 * No locking or otherwise interesting code is required for reading 84 * No locking or otherwise interesting code is required for reading
84 * o2hb_dead_threshold as it can't change once regions are active and 85 * o2hb_dead_threshold as it can't change once regions are active and
@@ -170,7 +171,7 @@ static void o2hb_write_timeout(struct work_struct *work)
170 171
171 mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " 172 mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
172 "milliseconds\n", reg->hr_dev_name, 173 "milliseconds\n", reg->hr_dev_name,
173 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); 174 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
174 o2quo_disk_timeout(); 175 o2quo_disk_timeout();
175} 176}
176 177
@@ -624,7 +625,7 @@ static int o2hb_check_slot(struct o2hb_region *reg,
624 "seq %llu last %llu changed %u equal %u\n", 625 "seq %llu last %llu changed %u equal %u\n",
625 slot->ds_node_num, (long long)slot->ds_last_generation, 626 slot->ds_node_num, (long long)slot->ds_last_generation,
626 le32_to_cpu(hb_block->hb_cksum), 627 le32_to_cpu(hb_block->hb_cksum),
627 (unsigned long long)le64_to_cpu(hb_block->hb_seq), 628 (unsigned long long)le64_to_cpu(hb_block->hb_seq),
628 (unsigned long long)slot->ds_last_time, slot->ds_changed_samples, 629 (unsigned long long)slot->ds_last_time, slot->ds_changed_samples,
629 slot->ds_equal_samples); 630 slot->ds_equal_samples);
630 631
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 1cd2934de615..3bb928a2bf7d 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -112,6 +112,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
112 define_mask(XATTR), 112 define_mask(XATTR),
113 define_mask(QUOTA), 113 define_mask(QUOTA),
114 define_mask(REFCOUNT), 114 define_mask(REFCOUNT),
115 define_mask(BASTS),
115 define_mask(ERROR), 116 define_mask(ERROR),
116 define_mask(NOTICE), 117 define_mask(NOTICE),
117 define_mask(KTHREAD), 118 define_mask(KTHREAD),
@@ -135,7 +136,7 @@ static ssize_t mlog_store(struct kobject *obj, struct attribute *attr,
135 return mlog_mask_store(mlog_attr->mask, buf, count); 136 return mlog_mask_store(mlog_attr->mask, buf, count);
136} 137}
137 138
138static struct sysfs_ops mlog_attr_ops = { 139static const struct sysfs_ops mlog_attr_ops = {
139 .show = mlog_show, 140 .show = mlog_show,
140 .store = mlog_store, 141 .store = mlog_store,
141}; 142};
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 9b4d11726cf2..3dfddbec32f2 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -114,6 +114,7 @@
114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ 114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ 115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */ 116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */
117#define ML_BASTS 0x0000001000000000ULL /* dlmglue asts and basts */
117/* bits that are infrequently given and frequently matched in the high word */ 118/* bits that are infrequently given and frequently matched in the high word */
118#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
119#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ 120#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */
@@ -194,9 +195,9 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
194 * previous token if args expands to nothing. 195 * previous token if args expands to nothing.
195 */ 196 */
196#define __mlog_printk(level, fmt, args...) \ 197#define __mlog_printk(level, fmt, args...) \
197 printk(level "(%u,%lu):%s:%d " fmt, task_pid_nr(current), \ 198 printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \
198 __mlog_cpu_guess, __PRETTY_FUNCTION__, __LINE__ , \ 199 task_pid_nr(current), __mlog_cpu_guess, \
199 ##args) 200 __PRETTY_FUNCTION__, __LINE__ , ##args)
200 201
201#define mlog(mask, fmt, args...) do { \ 202#define mlog(mask, fmt, args...) do { \
202 u64 __m = MLOG_MASK_PREFIX | (mask); \ 203 u64 __m = MLOG_MASK_PREFIX | (mask); \
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index c81142e3ef84..ed0c9f367fed 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -19,6 +19,7 @@
19 * Boston, MA 021110-1307, USA. 19 * Boston, MA 021110-1307, USA.
20 */ 20 */
21 21
22#include <linux/slab.h>
22#include <linux/kernel.h> 23#include <linux/kernel.h>
23#include <linux/module.h> 24#include <linux/module.h>
24#include <linux/configfs.h> 25#include <linux/configfs.h>
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 639024033fce..cf3e16696216 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -44,7 +44,6 @@
44 * and if they're the last, they fire off the decision. 44 * and if they're the last, they fire off the decision.
45 */ 45 */
46#include <linux/kernel.h> 46#include <linux/kernel.h>
47#include <linux/slab.h>
48#include <linux/workqueue.h> 47#include <linux/workqueue.h>
49#include <linux/reboot.h> 48#include <linux/reboot.h>
50 49
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 334f231a422c..73e743eea2c8 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -72,9 +72,9 @@
72 72
73#include "tcp_internal.h" 73#include "tcp_internal.h"
74 74
75#define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u" 75#define SC_NODEF_FMT "node %s (num %u) at %pI4:%u"
76#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \ 76#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \
77 NIPQUAD(sc->sc_node->nd_ipv4_address), \ 77 &sc->sc_node->nd_ipv4_address, \
78 ntohs(sc->sc_node->nd_ipv4_port) 78 ntohs(sc->sc_node->nd_ipv4_port)
79 79
80/* 80/*
@@ -485,7 +485,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
485 } 485 }
486 486
487 if (was_valid && !valid) { 487 if (was_valid && !valid) {
488 printk(KERN_INFO "o2net: no longer connected to " 488 printk(KERN_NOTICE "o2net: no longer connected to "
489 SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); 489 SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
490 o2net_complete_nodes_nsw(nn); 490 o2net_complete_nodes_nsw(nn);
491 } 491 }
@@ -493,7 +493,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
493 if (!was_valid && valid) { 493 if (!was_valid && valid) {
494 o2quo_conn_up(o2net_num_from_nn(nn)); 494 o2quo_conn_up(o2net_num_from_nn(nn));
495 cancel_delayed_work(&nn->nn_connect_expired); 495 cancel_delayed_work(&nn->nn_connect_expired);
496 printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n", 496 printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n",
497 o2nm_this_node() > sc->sc_node->nd_num ? 497 o2nm_this_node() > sc->sc_node->nd_num ?
498 "connected to" : "accepted connection from", 498 "connected to" : "accepted connection from",
499 SC_NODEF_ARGS(sc)); 499 SC_NODEF_ARGS(sc));
@@ -930,7 +930,7 @@ static void o2net_sendpage(struct o2net_sock_container *sc,
930 cond_resched(); 930 cond_resched();
931 continue; 931 continue;
932 } 932 }
933 mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT 933 mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT
934 " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret); 934 " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret);
935 o2net_ensure_shutdown(nn, sc, 0); 935 o2net_ensure_shutdown(nn, sc, 0);
936 break; 936 break;
@@ -1476,14 +1476,14 @@ static void o2net_idle_timer(unsigned long data)
1476 1476
1477 do_gettimeofday(&now); 1477 do_gettimeofday(&now);
1478 1478
1479 printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " 1479 printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
1480 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), 1480 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
1481 o2net_idle_timeout() / 1000, 1481 o2net_idle_timeout() / 1000,
1482 o2net_idle_timeout() % 1000); 1482 o2net_idle_timeout() % 1000);
1483 mlog(ML_NOTICE, "here are some times that might help debug the " 1483 mlog(ML_NOTICE, "here are some times that might help debug the "
1484 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " 1484 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
1485 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", 1485 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
1486 sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec, 1486 sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec,
1487 now.tv_sec, (long) now.tv_usec, 1487 now.tv_sec, (long) now.tv_usec,
1488 sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec, 1488 sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec,
1489 sc->sc_tv_advance_start.tv_sec, 1489 sc->sc_tv_advance_start.tv_sec,
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 8d58cfe410b1..96fa7ebc530c 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -32,10 +32,10 @@
32 * on their number */ 32 * on their number */
33#define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS) 33#define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)
34 34
35/* 35/*
36 * This version number represents quite a lot, unfortunately. It not 36 * This version number represents quite a lot, unfortunately. It not
37 * only represents the raw network message protocol on the wire but also 37 * only represents the raw network message protocol on the wire but also
38 * locking semantics of the file system using the protocol. It should 38 * locking semantics of the file system using the protocol. It should
39 * be somewhere else, I'm sure, but right now it isn't. 39 * be somewhere else, I'm sure, but right now it isn't.
40 * 40 *
41 * With version 11, we separate out the filesystem locking portion. The 41 * With version 11, we separate out the filesystem locking portion. The
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 28c3ec238796..efd77d071c80 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2439,7 +2439,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2439 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 2439 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2440 memset(dx_root, 0, osb->sb->s_blocksize); 2440 memset(dx_root, 0, osb->sb->s_blocksize);
2441 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE); 2441 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
2442 dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num); 2442 dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
2443 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit); 2443 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
2444 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); 2444 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
2445 dx_root->dr_blkno = cpu_to_le64(dr_blkno); 2445 dx_root->dr_blkno = cpu_to_le64(dr_blkno);
@@ -2964,12 +2964,10 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
2964 goto out; 2964 goto out;
2965 } 2965 }
2966 2966
2967 if (vfs_dq_alloc_space_nodirty(dir, 2967 ret = dquot_alloc_space_nodirty(dir,
2968 ocfs2_clusters_to_bytes(osb->sb, 2968 ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc));
2969 alloc + dx_alloc))) { 2969 if (ret)
2970 ret = -EDQUOT;
2971 goto out_commit; 2970 goto out_commit;
2972 }
2973 did_quota = 1; 2971 did_quota = 1;
2974 2972
2975 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { 2973 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
@@ -3178,7 +3176,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3178 3176
3179out_commit: 3177out_commit:
3180 if (ret < 0 && did_quota) 3178 if (ret < 0 && did_quota)
3181 vfs_dq_free_space_nodirty(dir, bytes_allocated); 3179 dquot_free_space_nodirty(dir, bytes_allocated);
3182 3180
3183 ocfs2_commit_trans(osb, handle); 3181 ocfs2_commit_trans(osb, handle);
3184 3182
@@ -3221,11 +3219,10 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
3221 if (extend) { 3219 if (extend) {
3222 u32 offset = OCFS2_I(dir)->ip_clusters; 3220 u32 offset = OCFS2_I(dir)->ip_clusters;
3223 3221
3224 if (vfs_dq_alloc_space_nodirty(dir, 3222 status = dquot_alloc_space_nodirty(dir,
3225 ocfs2_clusters_to_bytes(sb, 1))) { 3223 ocfs2_clusters_to_bytes(sb, 1));
3226 status = -EDQUOT; 3224 if (status)
3227 goto bail; 3225 goto bail;
3228 }
3229 did_quota = 1; 3226 did_quota = 1;
3230 3227
3231 status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, 3228 status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
@@ -3254,7 +3251,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
3254 status = 0; 3251 status = 0;
3255bail: 3252bail:
3256 if (did_quota && status < 0) 3253 if (did_quota && status < 0)
3257 vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); 3254 dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
3258 mlog_exit(status); 3255 mlog_exit(status);
3259 return status; 3256 return status;
3260} 3257}
@@ -3889,11 +3886,10 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3889 goto out; 3886 goto out;
3890 } 3887 }
3891 3888
3892 if (vfs_dq_alloc_space_nodirty(dir, 3889 ret = dquot_alloc_space_nodirty(dir,
3893 ocfs2_clusters_to_bytes(dir->i_sb, 1))) { 3890 ocfs2_clusters_to_bytes(dir->i_sb, 1));
3894 ret = -EDQUOT; 3891 if (ret)
3895 goto out_commit; 3892 goto out_commit;
3896 }
3897 did_quota = 1; 3893 did_quota = 1;
3898 3894
3899 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, 3895 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
@@ -3983,7 +3979,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3983 3979
3984out_commit: 3980out_commit:
3985 if (ret < 0 && did_quota) 3981 if (ret < 0 && did_quota)
3986 vfs_dq_free_space_nodirty(dir, 3982 dquot_free_space_nodirty(dir,
3987 ocfs2_clusters_to_bytes(dir->i_sb, 1)); 3983 ocfs2_clusters_to_bytes(dir->i_sb, 1));
3988 3984
3989 ocfs2_commit_trans(osb, handle); 3985 ocfs2_commit_trans(osb, handle);
@@ -4165,11 +4161,10 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
4165 goto out; 4161 goto out;
4166 } 4162 }
4167 4163
4168 if (vfs_dq_alloc_space_nodirty(dir, 4164 ret = dquot_alloc_space_nodirty(dir,
4169 ocfs2_clusters_to_bytes(osb->sb, 1))) { 4165 ocfs2_clusters_to_bytes(osb->sb, 1));
4170 ret = -EDQUOT; 4166 if (ret)
4171 goto out_commit; 4167 goto out_commit;
4172 }
4173 did_quota = 1; 4168 did_quota = 1;
4174 4169
4175 /* 4170 /*
@@ -4229,7 +4224,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
4229 4224
4230out_commit: 4225out_commit:
4231 if (ret < 0 && did_quota) 4226 if (ret < 0 && did_quota)
4232 vfs_dq_free_space_nodirty(dir, 4227 dquot_free_space_nodirty(dir,
4233 ocfs2_clusters_to_bytes(dir->i_sb, 1)); 4228 ocfs2_clusters_to_bytes(dir->i_sb, 1));
4234 4229
4235 ocfs2_commit_trans(osb, handle); 4230 ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index 190361375700..dcebf0d920fa 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,8 +1,7 @@
1EXTRA_CFLAGS += -Ifs/ocfs2 1EXTRA_CFLAGS += -Ifs/ocfs2
2 2
3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlmfs.o 3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
4 4
5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ 5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o 6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
7 7
8ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h
index b5786a787fab..3cfa114aa391 100644
--- a/fs/ocfs2/dlm/dlmapi.h
+++ b/fs/ocfs2/dlm/dlmapi.h
@@ -95,7 +95,7 @@ const char *dlm_errname(enum dlm_status err);
95 mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st))); \ 95 mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st))); \
96} while (0) 96} while (0)
97 97
98#define DLM_LKSB_UNUSED1 0x01 98#define DLM_LKSB_UNUSED1 0x01
99#define DLM_LKSB_PUT_LVB 0x02 99#define DLM_LKSB_PUT_LVB 0x02
100#define DLM_LKSB_GET_LVB 0x04 100#define DLM_LKSB_GET_LVB 0x04
101#define DLM_LKSB_UNUSED2 0x08 101#define DLM_LKSB_UNUSED2 0x08
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 01cf8cc3d286..a795eb91f4ea 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
@@ -123,7 +122,7 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
123 dlm_lock_put(lock); 122 dlm_lock_put(lock);
124 /* free up the reserved bast that we are cancelling. 123 /* free up the reserved bast that we are cancelling.
125 * guaranteed that this will not be the last reserved 124 * guaranteed that this will not be the last reserved
126 * ast because *both* an ast and a bast were reserved 125 * ast because *both* an ast and a bast were reserved
127 * to get to this point. the res->spinlock will not be 126 * to get to this point. the res->spinlock will not be
128 * taken here */ 127 * taken here */
129 dlm_lockres_release_ast(dlm, res); 128 dlm_lockres_release_ast(dlm, res);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index ca96bce50e18..90803b47cd8c 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
@@ -396,7 +395,7 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
396 /* instead of logging the same network error over 395 /* instead of logging the same network error over
397 * and over, sleep here and wait for the heartbeat 396 * and over, sleep here and wait for the heartbeat
398 * to notice the node is dead. times out after 5s. */ 397 * to notice the node is dead. times out after 5s. */
399 dlm_wait_for_node_death(dlm, res->owner, 398 dlm_wait_for_node_death(dlm, res->owner,
400 DLM_NODE_DEATH_WAIT_MAX); 399 DLM_NODE_DEATH_WAIT_MAX);
401 ret = DLM_RECOVERING; 400 ret = DLM_RECOVERING;
402 mlog(0, "node %u died so returning DLM_RECOVERING " 401 mlog(0, "node %u died so returning DLM_RECOVERING "
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 42b0bad7a612..0cd24cf54396 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -102,7 +102,7 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
102 assert_spin_locked(&res->spinlock); 102 assert_spin_locked(&res->spinlock);
103 103
104 stringify_lockname(res->lockname.name, res->lockname.len, 104 stringify_lockname(res->lockname.name, res->lockname.len,
105 buf, sizeof(buf) - 1); 105 buf, sizeof(buf));
106 printk("lockres: %s, owner=%u, state=%u\n", 106 printk("lockres: %s, owner=%u, state=%u\n",
107 buf, res->owner, res->state); 107 buf, res->owner, res->state);
108 printk(" last used: %lu, refcnt: %u, on purge list: %s\n", 108 printk(" last used: %lu, refcnt: %u, on purge list: %s\n",
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 0334000676d3..988c9055fd4e 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -816,7 +816,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
816 } 816 }
817 817
818 /* Once the dlm ctxt is marked as leaving then we don't want 818 /* Once the dlm ctxt is marked as leaving then we don't want
819 * to be put in someone's domain map. 819 * to be put in someone's domain map.
820 * Also, explicitly disallow joining at certain troublesome 820 * Also, explicitly disallow joining at certain troublesome
821 * times (ie. during recovery). */ 821 * times (ie. during recovery). */
822 if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) { 822 if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) {
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 437698e9465f..733337772671 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -269,7 +269,7 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
269 } 269 }
270 dlm_revert_pending_lock(res, lock); 270 dlm_revert_pending_lock(res, lock);
271 dlm_lock_put(lock); 271 dlm_lock_put(lock);
272 } else if (dlm_is_recovery_lock(res->lockname.name, 272 } else if (dlm_is_recovery_lock(res->lockname.name,
273 res->lockname.len)) { 273 res->lockname.len)) {
274 /* special case for the $RECOVERY lock. 274 /* special case for the $RECOVERY lock.
275 * there will never be an AST delivered to put 275 * there will never be an AST delivered to put
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 03ccf9a7b1f4..9289b4357d27 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -366,7 +366,7 @@ void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
366 struct dlm_master_list_entry *mle; 366 struct dlm_master_list_entry *mle;
367 367
368 assert_spin_locked(&dlm->spinlock); 368 assert_spin_locked(&dlm->spinlock);
369 369
370 list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) { 370 list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
371 if (node_up) 371 if (node_up)
372 dlm_mle_node_up(dlm, mle, NULL, idx); 372 dlm_mle_node_up(dlm, mle, NULL, idx);
@@ -833,7 +833,7 @@ lookup:
833 __dlm_insert_mle(dlm, mle); 833 __dlm_insert_mle(dlm, mle);
834 834
835 /* still holding the dlm spinlock, check the recovery map 835 /* still holding the dlm spinlock, check the recovery map
836 * to see if there are any nodes that still need to be 836 * to see if there are any nodes that still need to be
837 * considered. these will not appear in the mle nodemap 837 * considered. these will not appear in the mle nodemap
838 * but they might own this lockres. wait on them. */ 838 * but they might own this lockres. wait on them. */
839 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); 839 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
@@ -883,7 +883,7 @@ redo_request:
883 msleep(500); 883 msleep(500);
884 } 884 }
885 continue; 885 continue;
886 } 886 }
887 887
888 dlm_kick_recovery_thread(dlm); 888 dlm_kick_recovery_thread(dlm);
889 msleep(1000); 889 msleep(1000);
@@ -939,8 +939,8 @@ wait:
939 res->lockname.name, blocked); 939 res->lockname.name, blocked);
940 if (++tries > 20) { 940 if (++tries > 20) {
941 mlog(ML_ERROR, "%s:%.*s: spinning on " 941 mlog(ML_ERROR, "%s:%.*s: spinning on "
942 "dlm_wait_for_lock_mastery, blocked=%d\n", 942 "dlm_wait_for_lock_mastery, blocked=%d\n",
943 dlm->name, res->lockname.len, 943 dlm->name, res->lockname.len,
944 res->lockname.name, blocked); 944 res->lockname.name, blocked);
945 dlm_print_one_lock_resource(res); 945 dlm_print_one_lock_resource(res);
946 dlm_print_one_mle(mle); 946 dlm_print_one_mle(mle);
@@ -1029,7 +1029,7 @@ recheck:
1029 ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked); 1029 ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
1030 b = (mle->type == DLM_MLE_BLOCK); 1030 b = (mle->type == DLM_MLE_BLOCK);
1031 if ((*blocked && !b) || (!*blocked && b)) { 1031 if ((*blocked && !b) || (!*blocked && b)) {
1032 mlog(0, "%s:%.*s: status change: old=%d new=%d\n", 1032 mlog(0, "%s:%.*s: status change: old=%d new=%d\n",
1033 dlm->name, res->lockname.len, res->lockname.name, 1033 dlm->name, res->lockname.len, res->lockname.name,
1034 *blocked, b); 1034 *blocked, b);
1035 *blocked = b; 1035 *blocked = b;
@@ -1602,7 +1602,7 @@ send_response:
1602 } 1602 }
1603 mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", 1603 mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
1604 dlm->node_num, res->lockname.len, res->lockname.name); 1604 dlm->node_num, res->lockname.len, res->lockname.name);
1605 ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, 1605 ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx,
1606 DLM_ASSERT_MASTER_MLE_CLEANUP); 1606 DLM_ASSERT_MASTER_MLE_CLEANUP);
1607 if (ret < 0) { 1607 if (ret < 0) {
1608 mlog(ML_ERROR, "failed to dispatch assert master work\n"); 1608 mlog(ML_ERROR, "failed to dispatch assert master work\n");
@@ -1701,7 +1701,7 @@ again:
1701 1701
1702 if (r & DLM_ASSERT_RESPONSE_REASSERT) { 1702 if (r & DLM_ASSERT_RESPONSE_REASSERT) {
1703 mlog(0, "%.*s: node %u create mles on other " 1703 mlog(0, "%.*s: node %u create mles on other "
1704 "nodes and requests a re-assert\n", 1704 "nodes and requests a re-assert\n",
1705 namelen, lockname, to); 1705 namelen, lockname, to);
1706 reassert = 1; 1706 reassert = 1;
1707 } 1707 }
@@ -1812,7 +1812,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
1812 spin_unlock(&dlm->master_lock); 1812 spin_unlock(&dlm->master_lock);
1813 spin_unlock(&dlm->spinlock); 1813 spin_unlock(&dlm->spinlock);
1814 goto done; 1814 goto done;
1815 } 1815 }
1816 } 1816 }
1817 } 1817 }
1818 spin_unlock(&dlm->master_lock); 1818 spin_unlock(&dlm->master_lock);
@@ -1875,7 +1875,6 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
1875ok: 1875ok:
1876 spin_unlock(&res->spinlock); 1876 spin_unlock(&res->spinlock);
1877 } 1877 }
1878 spin_unlock(&dlm->spinlock);
1879 1878
1880 // mlog(0, "woo! got an assert_master from node %u!\n", 1879 // mlog(0, "woo! got an assert_master from node %u!\n",
1881 // assert->node_idx); 1880 // assert->node_idx);
@@ -1883,7 +1882,7 @@ ok:
1883 int extra_ref = 0; 1882 int extra_ref = 0;
1884 int nn = -1; 1883 int nn = -1;
1885 int rr, err = 0; 1884 int rr, err = 0;
1886 1885
1887 spin_lock(&mle->spinlock); 1886 spin_lock(&mle->spinlock);
1888 if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) 1887 if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION)
1889 extra_ref = 1; 1888 extra_ref = 1;
@@ -1891,7 +1890,7 @@ ok:
1891 /* MASTER mle: if any bits set in the response map 1890 /* MASTER mle: if any bits set in the response map
1892 * then the calling node needs to re-assert to clear 1891 * then the calling node needs to re-assert to clear
1893 * up nodes that this node contacted */ 1892 * up nodes that this node contacted */
1894 while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES, 1893 while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
1895 nn+1)) < O2NM_MAX_NODES) { 1894 nn+1)) < O2NM_MAX_NODES) {
1896 if (nn != dlm->node_num && nn != assert->node_idx) 1895 if (nn != dlm->node_num && nn != assert->node_idx)
1897 master_request = 1; 1896 master_request = 1;
@@ -1926,7 +1925,6 @@ ok:
1926 /* master is known, detach if not already detached. 1925 /* master is known, detach if not already detached.
1927 * ensures that only one assert_master call will happen 1926 * ensures that only one assert_master call will happen
1928 * on this mle. */ 1927 * on this mle. */
1929 spin_lock(&dlm->spinlock);
1930 spin_lock(&dlm->master_lock); 1928 spin_lock(&dlm->master_lock);
1931 1929
1932 rr = atomic_read(&mle->mle_refs.refcount); 1930 rr = atomic_read(&mle->mle_refs.refcount);
@@ -1959,7 +1957,6 @@ ok:
1959 __dlm_put_mle(mle); 1957 __dlm_put_mle(mle);
1960 } 1958 }
1961 spin_unlock(&dlm->master_lock); 1959 spin_unlock(&dlm->master_lock);
1962 spin_unlock(&dlm->spinlock);
1963 } else if (res) { 1960 } else if (res) {
1964 if (res->owner != assert->node_idx) { 1961 if (res->owner != assert->node_idx) {
1965 mlog(0, "assert_master from %u, but current " 1962 mlog(0, "assert_master from %u, but current "
@@ -1967,6 +1964,7 @@ ok:
1967 res->owner, namelen, name); 1964 res->owner, namelen, name);
1968 } 1965 }
1969 } 1966 }
1967 spin_unlock(&dlm->spinlock);
1970 1968
1971done: 1969done:
1972 ret = 0; 1970 ret = 0;
@@ -2002,7 +2000,7 @@ kill:
2002 __dlm_print_one_lock_resource(res); 2000 __dlm_print_one_lock_resource(res);
2003 spin_unlock(&res->spinlock); 2001 spin_unlock(&res->spinlock);
2004 spin_unlock(&dlm->spinlock); 2002 spin_unlock(&dlm->spinlock);
2005 *ret_data = (void *)res; 2003 *ret_data = (void *)res;
2006 dlm_put(dlm); 2004 dlm_put(dlm);
2007 return -EINVAL; 2005 return -EINVAL;
2008} 2006}
@@ -2040,10 +2038,10 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
2040 item->u.am.request_from = request_from; 2038 item->u.am.request_from = request_from;
2041 item->u.am.flags = flags; 2039 item->u.am.flags = flags;
2042 2040
2043 if (ignore_higher) 2041 if (ignore_higher)
2044 mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len, 2042 mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len,
2045 res->lockname.name); 2043 res->lockname.name);
2046 2044
2047 spin_lock(&dlm->work_lock); 2045 spin_lock(&dlm->work_lock);
2048 list_add_tail(&item->list, &dlm->work_list); 2046 list_add_tail(&item->list, &dlm->work_list);
2049 spin_unlock(&dlm->work_lock); 2047 spin_unlock(&dlm->work_lock);
@@ -2133,7 +2131,7 @@ put:
2133 * think that $RECOVERY is currently mastered by a dead node. If so, 2131 * think that $RECOVERY is currently mastered by a dead node. If so,
2134 * we wait a short time to allow that node to get notified by its own 2132 * we wait a short time to allow that node to get notified by its own
2135 * heartbeat stack, then check again. All $RECOVERY lock resources 2133 * heartbeat stack, then check again. All $RECOVERY lock resources
2136 * mastered by dead nodes are purged when the hearbeat callback is 2134 * mastered by dead nodes are purged when the hearbeat callback is
2137 * fired, so we can know for sure that it is safe to continue once 2135 * fired, so we can know for sure that it is safe to continue once
2138 * the node returns a live node or no node. */ 2136 * the node returns a live node or no node. */
2139static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, 2137static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
@@ -2174,7 +2172,7 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
2174 ret = -EAGAIN; 2172 ret = -EAGAIN;
2175 } 2173 }
2176 spin_unlock(&dlm->spinlock); 2174 spin_unlock(&dlm->spinlock);
2177 mlog(0, "%s: reco lock master is %u\n", dlm->name, 2175 mlog(0, "%s: reco lock master is %u\n", dlm->name,
2178 master); 2176 master);
2179 break; 2177 break;
2180 } 2178 }
@@ -2602,7 +2600,7 @@ fail:
2602 2600
2603 mlog(0, "%s:%.*s: timed out during migration\n", 2601 mlog(0, "%s:%.*s: timed out during migration\n",
2604 dlm->name, res->lockname.len, res->lockname.name); 2602 dlm->name, res->lockname.len, res->lockname.name);
2605 /* avoid hang during shutdown when migrating lockres 2603 /* avoid hang during shutdown when migrating lockres
2606 * to a node which also goes down */ 2604 * to a node which also goes down */
2607 if (dlm_is_node_dead(dlm, target)) { 2605 if (dlm_is_node_dead(dlm, target)) {
2608 mlog(0, "%s:%.*s: expected migration " 2606 mlog(0, "%s:%.*s: expected migration "
@@ -2738,7 +2736,7 @@ static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
2738 can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING); 2736 can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
2739 spin_unlock(&res->spinlock); 2737 spin_unlock(&res->spinlock);
2740 2738
2741 /* target has died, so make the caller break out of the 2739 /* target has died, so make the caller break out of the
2742 * wait_event, but caller must recheck the domain_map */ 2740 * wait_event, but caller must recheck the domain_map */
2743 spin_lock(&dlm->spinlock); 2741 spin_lock(&dlm->spinlock);
2744 if (!test_bit(mig_target, dlm->domain_map)) 2742 if (!test_bit(mig_target, dlm->domain_map))
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 2f9e4e19a4f2..b4f99de2caf3 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -310,7 +310,7 @@ static int dlm_recovery_thread(void *data)
310 mlog(0, "dlm thread running for %s...\n", dlm->name); 310 mlog(0, "dlm thread running for %s...\n", dlm->name);
311 311
312 while (!kthread_should_stop()) { 312 while (!kthread_should_stop()) {
313 if (dlm_joined(dlm)) { 313 if (dlm_domain_fully_joined(dlm)) {
314 status = dlm_do_recovery(dlm); 314 status = dlm_do_recovery(dlm);
315 if (status == -EAGAIN) { 315 if (status == -EAGAIN) {
316 /* do not sleep, recheck immediately. */ 316 /* do not sleep, recheck immediately. */
@@ -1050,7 +1050,7 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
1050 if (lock->ml.node == dead_node) { 1050 if (lock->ml.node == dead_node) {
1051 mlog(0, "AHA! there was " 1051 mlog(0, "AHA! there was "
1052 "a $RECOVERY lock for dead " 1052 "a $RECOVERY lock for dead "
1053 "node %u (%s)!\n", 1053 "node %u (%s)!\n",
1054 dead_node, dlm->name); 1054 dead_node, dlm->name);
1055 list_del_init(&lock->list); 1055 list_del_init(&lock->list);
1056 dlm_lock_put(lock); 1056 dlm_lock_put(lock);
@@ -1164,6 +1164,39 @@ static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
1164 mres->master = master; 1164 mres->master = master;
1165} 1165}
1166 1166
1167static void dlm_prepare_lvb_for_migration(struct dlm_lock *lock,
1168 struct dlm_migratable_lockres *mres,
1169 int queue)
1170{
1171 if (!lock->lksb)
1172 return;
1173
1174 /* Ignore lvb in all locks in the blocked list */
1175 if (queue == DLM_BLOCKED_LIST)
1176 return;
1177
1178 /* Only consider lvbs in locks with granted EX or PR lock levels */
1179 if (lock->ml.type != LKM_EXMODE && lock->ml.type != LKM_PRMODE)
1180 return;
1181
1182 if (dlm_lvb_is_empty(mres->lvb)) {
1183 memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
1184 return;
1185 }
1186
1187 /* Ensure the lvb copied for migration matches in other valid locks */
1188 if (!memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))
1189 return;
1190
1191 mlog(ML_ERROR, "Mismatched lvb in lock cookie=%u:%llu, name=%.*s, "
1192 "node=%u\n",
1193 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
1194 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
1195 lock->lockres->lockname.len, lock->lockres->lockname.name,
1196 lock->ml.node);
1197 dlm_print_one_lock_resource(lock->lockres);
1198 BUG();
1199}
1167 1200
1168/* returns 1 if this lock fills the network structure, 1201/* returns 1 if this lock fills the network structure,
1169 * 0 otherwise */ 1202 * 0 otherwise */
@@ -1181,20 +1214,7 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock,
1181 ml->list = queue; 1214 ml->list = queue;
1182 if (lock->lksb) { 1215 if (lock->lksb) {
1183 ml->flags = lock->lksb->flags; 1216 ml->flags = lock->lksb->flags;
1184 /* send our current lvb */ 1217 dlm_prepare_lvb_for_migration(lock, mres, queue);
1185 if (ml->type == LKM_EXMODE ||
1186 ml->type == LKM_PRMODE) {
1187 /* if it is already set, this had better be a PR
1188 * and it has to match */
1189 if (!dlm_lvb_is_empty(mres->lvb) &&
1190 (ml->type == LKM_EXMODE ||
1191 memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
1192 mlog(ML_ERROR, "mismatched lvbs!\n");
1193 dlm_print_one_lock_resource(lock->lockres);
1194 BUG();
1195 }
1196 memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
1197 }
1198 } 1218 }
1199 ml->node = lock->ml.node; 1219 ml->node = lock->ml.node;
1200 mres->num_locks++; 1220 mres->num_locks++;
@@ -1730,6 +1750,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1730 struct dlm_lock *lock = NULL; 1750 struct dlm_lock *lock = NULL;
1731 u8 from = O2NM_MAX_NODES; 1751 u8 from = O2NM_MAX_NODES;
1732 unsigned int added = 0; 1752 unsigned int added = 0;
1753 __be64 c;
1733 1754
1734 mlog(0, "running %d locks for this lockres\n", mres->num_locks); 1755 mlog(0, "running %d locks for this lockres\n", mres->num_locks);
1735 for (i=0; i<mres->num_locks; i++) { 1756 for (i=0; i<mres->num_locks; i++) {
@@ -1777,19 +1798,48 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1777 /* lock is always created locally first, and 1798 /* lock is always created locally first, and
1778 * destroyed locally last. it must be on the list */ 1799 * destroyed locally last. it must be on the list */
1779 if (!lock) { 1800 if (!lock) {
1780 __be64 c = ml->cookie; 1801 c = ml->cookie;
1781 mlog(ML_ERROR, "could not find local lock " 1802 mlog(ML_ERROR, "Could not find local lock "
1782 "with cookie %u:%llu!\n", 1803 "with cookie %u:%llu, node %u, "
1804 "list %u, flags 0x%x, type %d, "
1805 "conv %d, highest blocked %d\n",
1783 dlm_get_lock_cookie_node(be64_to_cpu(c)), 1806 dlm_get_lock_cookie_node(be64_to_cpu(c)),
1784 dlm_get_lock_cookie_seq(be64_to_cpu(c))); 1807 dlm_get_lock_cookie_seq(be64_to_cpu(c)),
1808 ml->node, ml->list, ml->flags, ml->type,
1809 ml->convert_type, ml->highest_blocked);
1810 __dlm_print_one_lock_resource(res);
1811 BUG();
1812 }
1813
1814 if (lock->ml.node != ml->node) {
1815 c = lock->ml.cookie;
1816 mlog(ML_ERROR, "Mismatched node# in lock "
1817 "cookie %u:%llu, name %.*s, node %u\n",
1818 dlm_get_lock_cookie_node(be64_to_cpu(c)),
1819 dlm_get_lock_cookie_seq(be64_to_cpu(c)),
1820 res->lockname.len, res->lockname.name,
1821 lock->ml.node);
1822 c = ml->cookie;
1823 mlog(ML_ERROR, "Migrate lock cookie %u:%llu, "
1824 "node %u, list %u, flags 0x%x, type %d, "
1825 "conv %d, highest blocked %d\n",
1826 dlm_get_lock_cookie_node(be64_to_cpu(c)),
1827 dlm_get_lock_cookie_seq(be64_to_cpu(c)),
1828 ml->node, ml->list, ml->flags, ml->type,
1829 ml->convert_type, ml->highest_blocked);
1785 __dlm_print_one_lock_resource(res); 1830 __dlm_print_one_lock_resource(res);
1786 BUG(); 1831 BUG();
1787 } 1832 }
1788 BUG_ON(lock->ml.node != ml->node);
1789 1833
1790 if (tmpq != queue) { 1834 if (tmpq != queue) {
1791 mlog(0, "lock was on %u instead of %u for %.*s\n", 1835 c = ml->cookie;
1792 j, ml->list, res->lockname.len, res->lockname.name); 1836 mlog(0, "Lock cookie %u:%llu was on list %u "
1837 "instead of list %u for %.*s\n",
1838 dlm_get_lock_cookie_node(be64_to_cpu(c)),
1839 dlm_get_lock_cookie_seq(be64_to_cpu(c)),
1840 j, ml->list, res->lockname.len,
1841 res->lockname.name);
1842 __dlm_print_one_lock_resource(res);
1793 spin_unlock(&res->spinlock); 1843 spin_unlock(&res->spinlock);
1794 continue; 1844 continue;
1795 } 1845 }
@@ -1839,7 +1889,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1839 * the lvb. */ 1889 * the lvb. */
1840 memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); 1890 memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
1841 } else { 1891 } else {
1842 /* otherwise, the node is sending its 1892 /* otherwise, the node is sending its
1843 * most recent valid lvb info */ 1893 * most recent valid lvb info */
1844 BUG_ON(ml->type != LKM_EXMODE && 1894 BUG_ON(ml->type != LKM_EXMODE &&
1845 ml->type != LKM_PRMODE); 1895 ml->type != LKM_PRMODE);
@@ -1886,7 +1936,7 @@ skip_lvb:
1886 spin_lock(&res->spinlock); 1936 spin_lock(&res->spinlock);
1887 list_for_each_entry(lock, queue, list) { 1937 list_for_each_entry(lock, queue, list) {
1888 if (lock->ml.cookie == ml->cookie) { 1938 if (lock->ml.cookie == ml->cookie) {
1889 __be64 c = lock->ml.cookie; 1939 c = lock->ml.cookie;
1890 mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already " 1940 mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already "
1891 "exists on this lockres!\n", dlm->name, 1941 "exists on this lockres!\n", dlm->name,
1892 res->lockname.len, res->lockname.name, 1942 res->lockname.len, res->lockname.name,
@@ -2114,7 +2164,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
2114 assert_spin_locked(&res->spinlock); 2164 assert_spin_locked(&res->spinlock);
2115 2165
2116 if (res->owner == dlm->node_num) 2166 if (res->owner == dlm->node_num)
2117 /* if this node owned the lockres, and if the dead node 2167 /* if this node owned the lockres, and if the dead node
2118 * had an EX when he died, blank out the lvb */ 2168 * had an EX when he died, blank out the lvb */
2119 search_node = dead_node; 2169 search_node = dead_node;
2120 else { 2170 else {
@@ -2152,7 +2202,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2152 2202
2153 /* this node is the lockres master: 2203 /* this node is the lockres master:
2154 * 1) remove any stale locks for the dead node 2204 * 1) remove any stale locks for the dead node
2155 * 2) if the dead node had an EX when he died, blank out the lvb 2205 * 2) if the dead node had an EX when he died, blank out the lvb
2156 */ 2206 */
2157 assert_spin_locked(&dlm->spinlock); 2207 assert_spin_locked(&dlm->spinlock);
2158 assert_spin_locked(&res->spinlock); 2208 assert_spin_locked(&res->spinlock);
@@ -2193,7 +2243,12 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2193 mlog(0, "%s:%.*s: freed %u locks for dead node %u, " 2243 mlog(0, "%s:%.*s: freed %u locks for dead node %u, "
2194 "dropping ref from lockres\n", dlm->name, 2244 "dropping ref from lockres\n", dlm->name,
2195 res->lockname.len, res->lockname.name, freed, dead_node); 2245 res->lockname.len, res->lockname.name, freed, dead_node);
2196 BUG_ON(!test_bit(dead_node, res->refmap)); 2246 if(!test_bit(dead_node, res->refmap)) {
2247 mlog(ML_ERROR, "%s:%.*s: freed %u locks for dead node %u, "
2248 "but ref was not set\n", dlm->name,
2249 res->lockname.len, res->lockname.name, freed, dead_node);
2250 __dlm_print_one_lock_resource(res);
2251 }
2197 dlm_lockres_clear_refmap_bit(dead_node, res); 2252 dlm_lockres_clear_refmap_bit(dead_node, res);
2198 } else if (test_bit(dead_node, res->refmap)) { 2253 } else if (test_bit(dead_node, res->refmap)) {
2199 mlog(0, "%s:%.*s: dead node %u had a ref, but had " 2254 mlog(0, "%s:%.*s: dead node %u had a ref, but had "
@@ -2260,7 +2315,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2260 } 2315 }
2261 spin_unlock(&res->spinlock); 2316 spin_unlock(&res->spinlock);
2262 continue; 2317 continue;
2263 } 2318 }
2264 spin_lock(&res->spinlock); 2319 spin_lock(&res->spinlock);
2265 /* zero the lvb if necessary */ 2320 /* zero the lvb if necessary */
2266 dlm_revalidate_lvb(dlm, res, dead_node); 2321 dlm_revalidate_lvb(dlm, res, dead_node);
@@ -2411,7 +2466,7 @@ static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
2411 * this function on each node racing to become the recovery 2466 * this function on each node racing to become the recovery
2412 * master will not stop attempting this until either: 2467 * master will not stop attempting this until either:
2413 * a) this node gets the EX (and becomes the recovery master), 2468 * a) this node gets the EX (and becomes the recovery master),
2414 * or b) dlm->reco.new_master gets set to some nodenum 2469 * or b) dlm->reco.new_master gets set to some nodenum
2415 * != O2NM_INVALID_NODE_NUM (another node will do the reco). 2470 * != O2NM_INVALID_NODE_NUM (another node will do the reco).
2416 * so each time a recovery master is needed, the entire cluster 2471 * so each time a recovery master is needed, the entire cluster
2417 * will sync at this point. if the new master dies, that will 2472 * will sync at this point. if the new master dies, that will
@@ -2424,7 +2479,7 @@ static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
2424 2479
2425 mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n", 2480 mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n",
2426 dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num); 2481 dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num);
2427again: 2482again:
2428 memset(&lksb, 0, sizeof(lksb)); 2483 memset(&lksb, 0, sizeof(lksb));
2429 2484
2430 ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, 2485 ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
@@ -2437,8 +2492,8 @@ again:
2437 if (ret == DLM_NORMAL) { 2492 if (ret == DLM_NORMAL) {
2438 mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n", 2493 mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n",
2439 dlm->name, dlm->node_num); 2494 dlm->name, dlm->node_num);
2440 2495
2441 /* got the EX lock. check to see if another node 2496 /* got the EX lock. check to see if another node
2442 * just became the reco master */ 2497 * just became the reco master */
2443 if (dlm_reco_master_ready(dlm)) { 2498 if (dlm_reco_master_ready(dlm)) {
2444 mlog(0, "%s: got reco EX lock, but %u will " 2499 mlog(0, "%s: got reco EX lock, but %u will "
@@ -2451,12 +2506,12 @@ again:
2451 /* see if recovery was already finished elsewhere */ 2506 /* see if recovery was already finished elsewhere */
2452 spin_lock(&dlm->spinlock); 2507 spin_lock(&dlm->spinlock);
2453 if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { 2508 if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
2454 status = -EINVAL; 2509 status = -EINVAL;
2455 mlog(0, "%s: got reco EX lock, but " 2510 mlog(0, "%s: got reco EX lock, but "
2456 "node got recovered already\n", dlm->name); 2511 "node got recovered already\n", dlm->name);
2457 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { 2512 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
2458 mlog(ML_ERROR, "%s: new master is %u " 2513 mlog(ML_ERROR, "%s: new master is %u "
2459 "but no dead node!\n", 2514 "but no dead node!\n",
2460 dlm->name, dlm->reco.new_master); 2515 dlm->name, dlm->reco.new_master);
2461 BUG(); 2516 BUG();
2462 } 2517 }
@@ -2468,7 +2523,7 @@ again:
2468 * set the master and send the messages to begin recovery */ 2523 * set the master and send the messages to begin recovery */
2469 if (!status) { 2524 if (!status) {
2470 mlog(0, "%s: dead=%u, this=%u, sending " 2525 mlog(0, "%s: dead=%u, this=%u, sending "
2471 "begin_reco now\n", dlm->name, 2526 "begin_reco now\n", dlm->name,
2472 dlm->reco.dead_node, dlm->node_num); 2527 dlm->reco.dead_node, dlm->node_num);
2473 status = dlm_send_begin_reco_message(dlm, 2528 status = dlm_send_begin_reco_message(dlm,
2474 dlm->reco.dead_node); 2529 dlm->reco.dead_node);
@@ -2501,7 +2556,7 @@ again:
2501 mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n", 2556 mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n",
2502 dlm->name, dlm->node_num); 2557 dlm->name, dlm->node_num);
2503 /* another node is master. wait on 2558 /* another node is master. wait on
2504 * reco.new_master != O2NM_INVALID_NODE_NUM 2559 * reco.new_master != O2NM_INVALID_NODE_NUM
2505 * for at most one second */ 2560 * for at most one second */
2506 wait_event_timeout(dlm->dlm_reco_thread_wq, 2561 wait_event_timeout(dlm->dlm_reco_thread_wq,
2507 dlm_reco_master_ready(dlm), 2562 dlm_reco_master_ready(dlm),
@@ -2589,7 +2644,13 @@ retry:
2589 "begin reco msg (%d)\n", dlm->name, nodenum, ret); 2644 "begin reco msg (%d)\n", dlm->name, nodenum, ret);
2590 ret = 0; 2645 ret = 0;
2591 } 2646 }
2592 if (ret == -EAGAIN) { 2647
2648 /*
2649 * Prior to commit aad1b15310b9bcd59fa81ab8f2b1513b59553ea8,
2650 * dlm_begin_reco_handler() returned EAGAIN and not -EAGAIN.
2651 * We are handling both for compatibility reasons.
2652 */
2653 if (ret == -EAGAIN || ret == EAGAIN) {
2593 mlog(0, "%s: trying to start recovery of node " 2654 mlog(0, "%s: trying to start recovery of node "
2594 "%u, but node %u is waiting for last recovery " 2655 "%u, but node %u is waiting for last recovery "
2595 "to complete, backoff for a bit\n", dlm->name, 2656 "to complete, backoff for a bit\n", dlm->name,
@@ -2599,7 +2660,7 @@ retry:
2599 } 2660 }
2600 if (ret < 0) { 2661 if (ret < 0) {
2601 struct dlm_lock_resource *res; 2662 struct dlm_lock_resource *res;
2602 /* this is now a serious problem, possibly ENOMEM 2663 /* this is now a serious problem, possibly ENOMEM
2603 * in the network stack. must retry */ 2664 * in the network stack. must retry */
2604 mlog_errno(ret); 2665 mlog_errno(ret);
2605 mlog(ML_ERROR, "begin reco of dlm %s to node %u " 2666 mlog(ML_ERROR, "begin reco of dlm %s to node %u "
@@ -2612,7 +2673,7 @@ retry:
2612 } else { 2673 } else {
2613 mlog(ML_ERROR, "recovery lock not found\n"); 2674 mlog(ML_ERROR, "recovery lock not found\n");
2614 } 2675 }
2615 /* sleep for a bit in hopes that we can avoid 2676 /* sleep for a bit in hopes that we can avoid
2616 * another ENOMEM */ 2677 * another ENOMEM */
2617 msleep(100); 2678 msleep(100);
2618 goto retry; 2679 goto retry;
@@ -2664,7 +2725,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
2664 } 2725 }
2665 if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) { 2726 if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) {
2666 mlog(ML_NOTICE, "%s: dead_node previously set to %u, " 2727 mlog(ML_NOTICE, "%s: dead_node previously set to %u, "
2667 "node %u changing it to %u\n", dlm->name, 2728 "node %u changing it to %u\n", dlm->name,
2668 dlm->reco.dead_node, br->node_idx, br->dead_node); 2729 dlm->reco.dead_node, br->node_idx, br->dead_node);
2669 } 2730 }
2670 dlm_set_reco_master(dlm, br->node_idx); 2731 dlm_set_reco_master(dlm, br->node_idx);
@@ -2730,8 +2791,8 @@ stage2:
2730 if (ret < 0) { 2791 if (ret < 0) {
2731 mlog_errno(ret); 2792 mlog_errno(ret);
2732 if (dlm_is_host_down(ret)) { 2793 if (dlm_is_host_down(ret)) {
2733 /* this has no effect on this recovery 2794 /* this has no effect on this recovery
2734 * session, so set the status to zero to 2795 * session, so set the status to zero to
2735 * finish out the last recovery */ 2796 * finish out the last recovery */
2736 mlog(ML_ERROR, "node %u went down after this " 2797 mlog(ML_ERROR, "node %u went down after this "
2737 "node finished recovery.\n", nodenum); 2798 "node finished recovery.\n", nodenum);
@@ -2768,7 +2829,7 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
2768 mlog(0, "%s: node %u finalizing recovery stage%d of " 2829 mlog(0, "%s: node %u finalizing recovery stage%d of "
2769 "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage, 2830 "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage,
2770 fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master); 2831 fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master);
2771 2832
2772 spin_lock(&dlm->spinlock); 2833 spin_lock(&dlm->spinlock);
2773 2834
2774 if (dlm->reco.new_master != fr->node_idx) { 2835 if (dlm->reco.new_master != fr->node_idx) {
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 52ec020ea78b..11a6d1fd1d35 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 00f53b2aea76..b47c1b92b82b 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -28,7 +28,6 @@
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h> 31#include <linux/highmem.h>
33#include <linux/init.h> 32#include <linux/init.h>
34#include <linux/sysctl.h> 33#include <linux/sysctl.h>
@@ -190,8 +189,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
190 actions &= ~(DLM_UNLOCK_REMOVE_LOCK| 189 actions &= ~(DLM_UNLOCK_REMOVE_LOCK|
191 DLM_UNLOCK_REGRANT_LOCK| 190 DLM_UNLOCK_REGRANT_LOCK|
192 DLM_UNLOCK_CLEAR_CONVERT_TYPE); 191 DLM_UNLOCK_CLEAR_CONVERT_TYPE);
193 } else if (status == DLM_RECOVERING || 192 } else if (status == DLM_RECOVERING ||
194 status == DLM_MIGRATING || 193 status == DLM_MIGRATING ||
195 status == DLM_FORWARD) { 194 status == DLM_FORWARD) {
196 /* must clear the actions because this unlock 195 /* must clear the actions because this unlock
197 * is about to be retried. cannot free or do 196 * is about to be retried. cannot free or do
@@ -661,14 +660,14 @@ retry:
661 if (call_ast) { 660 if (call_ast) {
662 mlog(0, "calling unlockast(%p, %d)\n", data, status); 661 mlog(0, "calling unlockast(%p, %d)\n", data, status);
663 if (is_master) { 662 if (is_master) {
664 /* it is possible that there is one last bast 663 /* it is possible that there is one last bast
665 * pending. make sure it is flushed, then 664 * pending. make sure it is flushed, then
666 * call the unlockast. 665 * call the unlockast.
667 * not an issue if this is a mastered remotely, 666 * not an issue if this is a mastered remotely,
668 * since this lock has been removed from the 667 * since this lock has been removed from the
669 * lockres queues and cannot be found. */ 668 * lockres queues and cannot be found. */
670 dlm_kick_thread(dlm, NULL); 669 dlm_kick_thread(dlm, NULL);
671 wait_event(dlm->ast_wq, 670 wait_event(dlm->ast_wq,
672 dlm_lock_basts_flushed(dlm, lock)); 671 dlm_lock_basts_flushed(dlm, lock));
673 } 672 }
674 (*unlockast)(data, status); 673 (*unlockast)(data, status);
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
new file mode 100644
index 000000000000..df69b4856d0d
--- /dev/null
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -0,0 +1,5 @@
# Add fs/ocfs2 to the include path for the objects built here.
# ccflags-y is the kbuild replacement for the deprecated EXTRA_CFLAGS.
ccflags-y += -Ifs/ocfs2

obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o

ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 02bf17808bdc..1b0de157a08c 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -43,24 +43,17 @@
43#include <linux/init.h> 43#include <linux/init.h>
44#include <linux/string.h> 44#include <linux/string.h>
45#include <linux/backing-dev.h> 45#include <linux/backing-dev.h>
46#include <linux/poll.h>
46 47
47#include <asm/uaccess.h> 48#include <asm/uaccess.h>
48 49
49 50#include "stackglue.h"
50#include "cluster/nodemanager.h"
51#include "cluster/heartbeat.h"
52#include "cluster/tcp.h"
53
54#include "dlmapi.h"
55
56#include "userdlm.h" 51#include "userdlm.h"
57
58#include "dlmfsver.h" 52#include "dlmfsver.h"
59 53
60#define MLOG_MASK_PREFIX ML_DLMFS 54#define MLOG_MASK_PREFIX ML_DLMFS
61#include "cluster/masklog.h" 55#include "cluster/masklog.h"
62 56
63#include "ocfs2_lockingver.h"
64 57
65static const struct super_operations dlmfs_ops; 58static const struct super_operations dlmfs_ops;
66static const struct file_operations dlmfs_file_operations; 59static const struct file_operations dlmfs_file_operations;
@@ -71,15 +64,46 @@ static struct kmem_cache *dlmfs_inode_cache;
71 64
72struct workqueue_struct *user_dlm_worker; 65struct workqueue_struct *user_dlm_worker;
73 66
67
68
74/* 69/*
75 * This is the userdlmfs locking protocol version. 70 * These are the ABI capabilities of dlmfs.
71 *
72 * Over time, dlmfs has added some features that were not part of the
73 * initial ABI. Unfortunately, some of these features are not detectable
74 * via standard usage. For example, Linux's default poll always returns
75 * POLLIN, so there is no way for a caller of poll(2) to know when dlmfs
76 * added poll support. Instead, we provide this list of new capabilities.
77 *
78 * Capabilities is a read-only attribute. We do it as a module parameter
79 * so we can discover it whether dlmfs is built in, loaded, or even not
80 * loaded.
76 * 81 *
77 * See fs/ocfs2/dlmglue.c for more details on locking versions. 82 * The ABI features are local to this machine's dlmfs mount. This is
83 * distinct from the locking protocol, which is concerned with inter-node
84 * interaction.
85 *
86 * Capabilities:
87 * - bast : POLLIN against the file descriptor of a held lock
88 * signifies a bast fired on the lock.
78 */ 89 */
79static const struct dlm_protocol_version user_locking_protocol = { 90#define DLMFS_CAPABILITIES "bast stackglue"
80 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, 91extern int param_set_dlmfs_capabilities(const char *val,
81 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, 92 struct kernel_param *kp)
82}; 93{
94 printk(KERN_ERR "%s: readonly parameter\n", kp->name);
95 return -EINVAL;
96}
97static int param_get_dlmfs_capabilities(char *buffer,
98 struct kernel_param *kp)
99{
100 return strlcpy(buffer, DLMFS_CAPABILITIES,
101 strlen(DLMFS_CAPABILITIES) + 1);
102}
103module_param_call(capabilities, param_set_dlmfs_capabilities,
104 param_get_dlmfs_capabilities, NULL, 0444);
105MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES);
106
83 107
84/* 108/*
85 * decodes a set of open flags into a valid lock level and a set of flags. 109 * decodes a set of open flags into a valid lock level and a set of flags.
@@ -179,13 +203,46 @@ static int dlmfs_file_release(struct inode *inode,
179 return 0; 203 return 0;
180} 204}
181 205
206/*
207 * We do ->setattr() just to override size changes. Our size is the size
208 * of the LVB and nothing else.
209 */
210static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
211{
212 int error;
213 struct inode *inode = dentry->d_inode;
214
215 attr->ia_valid &= ~ATTR_SIZE;
216 error = inode_change_ok(inode, attr);
217 if (!error)
218 error = inode_setattr(inode, attr);
219
220 return error;
221}
222
223static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
224{
225 int event = 0;
226 struct inode *inode = file->f_path.dentry->d_inode;
227 struct dlmfs_inode_private *ip = DLMFS_I(inode);
228
229 poll_wait(file, &ip->ip_lockres.l_event, wait);
230
231 spin_lock(&ip->ip_lockres.l_lock);
232 if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED)
233 event = POLLIN | POLLRDNORM;
234 spin_unlock(&ip->ip_lockres.l_lock);
235
236 return event;
237}
238
182static ssize_t dlmfs_file_read(struct file *filp, 239static ssize_t dlmfs_file_read(struct file *filp,
183 char __user *buf, 240 char __user *buf,
184 size_t count, 241 size_t count,
185 loff_t *ppos) 242 loff_t *ppos)
186{ 243{
187 int bytes_left; 244 int bytes_left;
188 ssize_t readlen; 245 ssize_t readlen, got;
189 char *lvb_buf; 246 char *lvb_buf;
190 struct inode *inode = filp->f_path.dentry->d_inode; 247 struct inode *inode = filp->f_path.dentry->d_inode;
191 248
@@ -211,9 +268,13 @@ static ssize_t dlmfs_file_read(struct file *filp,
211 if (!lvb_buf) 268 if (!lvb_buf)
212 return -ENOMEM; 269 return -ENOMEM;
213 270
214 user_dlm_read_lvb(inode, lvb_buf, readlen); 271 got = user_dlm_read_lvb(inode, lvb_buf, readlen);
215 bytes_left = __copy_to_user(buf, lvb_buf, readlen); 272 if (got) {
216 readlen -= bytes_left; 273 BUG_ON(got != readlen);
274 bytes_left = __copy_to_user(buf, lvb_buf, readlen);
275 readlen -= bytes_left;
276 } else
277 readlen = 0;
217 278
218 kfree(lvb_buf); 279 kfree(lvb_buf);
219 280
@@ -272,7 +333,7 @@ static void dlmfs_init_once(void *foo)
272 struct dlmfs_inode_private *ip = 333 struct dlmfs_inode_private *ip =
273 (struct dlmfs_inode_private *) foo; 334 (struct dlmfs_inode_private *) foo;
274 335
275 ip->ip_dlm = NULL; 336 ip->ip_conn = NULL;
276 ip->ip_parent = NULL; 337 ip->ip_parent = NULL;
277 338
278 inode_init_once(&ip->ip_vfs_inode); 339 inode_init_once(&ip->ip_vfs_inode);
@@ -314,14 +375,14 @@ static void dlmfs_clear_inode(struct inode *inode)
314 goto clear_fields; 375 goto clear_fields;
315 } 376 }
316 377
317 mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm); 378 mlog(0, "we're a directory, ip->ip_conn = 0x%p\n", ip->ip_conn);
318 /* we must be a directory. If required, lets unregister the 379 /* we must be a directory. If required, lets unregister the
319 * dlm context now. */ 380 * dlm context now. */
320 if (ip->ip_dlm) 381 if (ip->ip_conn)
321 user_dlm_unregister_context(ip->ip_dlm); 382 user_dlm_unregister(ip->ip_conn);
322clear_fields: 383clear_fields:
323 ip->ip_parent = NULL; 384 ip->ip_parent = NULL;
324 ip->ip_dlm = NULL; 385 ip->ip_conn = NULL;
325} 386}
326 387
327static struct backing_dev_info dlmfs_backing_dev_info = { 388static struct backing_dev_info dlmfs_backing_dev_info = {
@@ -371,7 +432,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
371 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 432 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
372 433
373 ip = DLMFS_I(inode); 434 ip = DLMFS_I(inode);
374 ip->ip_dlm = DLMFS_I(parent)->ip_dlm; 435 ip->ip_conn = DLMFS_I(parent)->ip_conn;
375 436
376 switch (mode & S_IFMT) { 437 switch (mode & S_IFMT) {
377 default: 438 default:
@@ -425,13 +486,12 @@ static int dlmfs_mkdir(struct inode * dir,
425 struct inode *inode = NULL; 486 struct inode *inode = NULL;
426 struct qstr *domain = &dentry->d_name; 487 struct qstr *domain = &dentry->d_name;
427 struct dlmfs_inode_private *ip; 488 struct dlmfs_inode_private *ip;
428 struct dlm_ctxt *dlm; 489 struct ocfs2_cluster_connection *conn;
429 struct dlm_protocol_version proto = user_locking_protocol;
430 490
431 mlog(0, "mkdir %.*s\n", domain->len, domain->name); 491 mlog(0, "mkdir %.*s\n", domain->len, domain->name);
432 492
433 /* verify that we have a proper domain */ 493 /* verify that we have a proper domain */
434 if (domain->len >= O2NM_MAX_NAME_LEN) { 494 if (domain->len >= GROUP_NAME_MAX) {
435 status = -EINVAL; 495 status = -EINVAL;
436 mlog(ML_ERROR, "invalid domain name for directory.\n"); 496 mlog(ML_ERROR, "invalid domain name for directory.\n");
437 goto bail; 497 goto bail;
@@ -446,14 +506,14 @@ static int dlmfs_mkdir(struct inode * dir,
446 506
447 ip = DLMFS_I(inode); 507 ip = DLMFS_I(inode);
448 508
449 dlm = user_dlm_register_context(domain, &proto); 509 conn = user_dlm_register(domain);
450 if (IS_ERR(dlm)) { 510 if (IS_ERR(conn)) {
451 status = PTR_ERR(dlm); 511 status = PTR_ERR(conn);
452 mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n", 512 mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
453 status, domain->len, domain->name); 513 status, domain->len, domain->name);
454 goto bail; 514 goto bail;
455 } 515 }
456 ip->ip_dlm = dlm; 516 ip->ip_conn = conn;
457 517
458 inc_nlink(dir); 518 inc_nlink(dir);
459 d_instantiate(dentry, inode); 519 d_instantiate(dentry, inode);
@@ -549,6 +609,7 @@ static int dlmfs_fill_super(struct super_block * sb,
549static const struct file_operations dlmfs_file_operations = { 609static const struct file_operations dlmfs_file_operations = {
550 .open = dlmfs_file_open, 610 .open = dlmfs_file_open,
551 .release = dlmfs_file_release, 611 .release = dlmfs_file_release,
612 .poll = dlmfs_file_poll,
552 .read = dlmfs_file_read, 613 .read = dlmfs_file_read,
553 .write = dlmfs_file_write, 614 .write = dlmfs_file_write,
554}; 615};
@@ -576,6 +637,7 @@ static const struct super_operations dlmfs_ops = {
576 637
577static const struct inode_operations dlmfs_file_inode_operations = { 638static const struct inode_operations dlmfs_file_inode_operations = {
578 .getattr = simple_getattr, 639 .getattr = simple_getattr,
640 .setattr = dlmfs_file_setattr,
579}; 641};
580 642
581static int dlmfs_get_sb(struct file_system_type *fs_type, 643static int dlmfs_get_sb(struct file_system_type *fs_type,
@@ -620,6 +682,7 @@ static int __init init_dlmfs_fs(void)
620 } 682 }
621 cleanup_worker = 1; 683 cleanup_worker = 1;
622 684
685 user_dlm_set_locking_protocol();
623 status = register_filesystem(&dlmfs_fs_type); 686 status = register_filesystem(&dlmfs_fs_type);
624bail: 687bail:
625 if (status) { 688 if (status) {
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c
index a733b3321f83..a733b3321f83 100644
--- a/fs/ocfs2/dlm/dlmfsver.c
+++ b/fs/ocfs2/dlmfs/dlmfsver.c
diff --git a/fs/ocfs2/dlm/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h
index f35eadbed25c..f35eadbed25c 100644
--- a/fs/ocfs2/dlm/dlmfsver.h
+++ b/fs/ocfs2/dlmfs/dlmfsver.h
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index 4cb1d3dae250..0499e3fb7bdb 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -34,18 +34,19 @@
34#include <linux/types.h> 34#include <linux/types.h>
35#include <linux/crc32.h> 35#include <linux/crc32.h>
36 36
37 37#include "ocfs2_lockingver.h"
38#include "cluster/nodemanager.h" 38#include "stackglue.h"
39#include "cluster/heartbeat.h"
40#include "cluster/tcp.h"
41
42#include "dlmapi.h"
43
44#include "userdlm.h" 39#include "userdlm.h"
45 40
46#define MLOG_MASK_PREFIX ML_DLMFS 41#define MLOG_MASK_PREFIX ML_DLMFS
47#include "cluster/masklog.h" 42#include "cluster/masklog.h"
48 43
44
45static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
46{
47 return container_of(lksb, struct user_lock_res, l_lksb);
48}
49
49static inline int user_check_wait_flag(struct user_lock_res *lockres, 50static inline int user_check_wait_flag(struct user_lock_res *lockres,
50 int flag) 51 int flag)
51{ 52{
@@ -73,15 +74,15 @@ static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
73} 74}
74 75
75/* I heart container_of... */ 76/* I heart container_of... */
76static inline struct dlm_ctxt * 77static inline struct ocfs2_cluster_connection *
77dlm_ctxt_from_user_lockres(struct user_lock_res *lockres) 78cluster_connection_from_user_lockres(struct user_lock_res *lockres)
78{ 79{
79 struct dlmfs_inode_private *ip; 80 struct dlmfs_inode_private *ip;
80 81
81 ip = container_of(lockres, 82 ip = container_of(lockres,
82 struct dlmfs_inode_private, 83 struct dlmfs_inode_private,
83 ip_lockres); 84 ip_lockres);
84 return ip->ip_dlm; 85 return ip->ip_conn;
85} 86}
86 87
87static struct inode * 88static struct inode *
@@ -103,9 +104,9 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
103} 104}
104 105
/*
 * Log a failed DLM call at ML_ERROR.
 *
 * _func    - name of the call that failed, printed with %s
 * _stat    - integer status it returned, printed with %d
 * _lockres - pointer to the lock resource; its l_namelen and l_name
 *            fields are printed
 *
 * Every parameter is parenthesized so that an expression argument
 * (e.g. "&res") expands correctly.  _lockres is expanded twice, so do
 * not pass an expression with side effects.
 */
#define user_log_dlm_error(_func, _stat, _lockres) do {			\
	mlog(ML_ERROR, "Dlm error %d while calling %s on "		\
		"resource %.*s\n", (_stat), (_func),			\
		(_lockres)->l_namelen, (_lockres)->l_name);		\
} while (0)
110 111
111/* WARNING: This function lives in a world where the only three lock 112/* WARNING: This function lives in a world where the only three lock
@@ -113,34 +114,35 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
113 * lock types are added. */ 114 * lock types are added. */
114static inline int user_highest_compat_lock_level(int level) 115static inline int user_highest_compat_lock_level(int level)
115{ 116{
116 int new_level = LKM_EXMODE; 117 int new_level = DLM_LOCK_EX;
117 118
118 if (level == LKM_EXMODE) 119 if (level == DLM_LOCK_EX)
119 new_level = LKM_NLMODE; 120 new_level = DLM_LOCK_NL;
120 else if (level == LKM_PRMODE) 121 else if (level == DLM_LOCK_PR)
121 new_level = LKM_PRMODE; 122 new_level = DLM_LOCK_PR;
122 return new_level; 123 return new_level;
123} 124}
124 125
125static void user_ast(void *opaque) 126static void user_ast(struct ocfs2_dlm_lksb *lksb)
126{ 127{
127 struct user_lock_res *lockres = opaque; 128 struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
128 struct dlm_lockstatus *lksb; 129 int status;
129 130
130 mlog(0, "AST fired for lockres %.*s\n", lockres->l_namelen, 131 mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n",
131 lockres->l_name); 132 lockres->l_namelen, lockres->l_name, lockres->l_level,
133 lockres->l_requested);
132 134
133 spin_lock(&lockres->l_lock); 135 spin_lock(&lockres->l_lock);
134 136
135 lksb = &(lockres->l_lksb); 137 status = ocfs2_dlm_lock_status(&lockres->l_lksb);
136 if (lksb->status != DLM_NORMAL) { 138 if (status) {
137 mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n", 139 mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n",
138 lksb->status, lockres->l_namelen, lockres->l_name); 140 status, lockres->l_namelen, lockres->l_name);
139 spin_unlock(&lockres->l_lock); 141 spin_unlock(&lockres->l_lock);
140 return; 142 return;
141 } 143 }
142 144
143 mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE, 145 mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV,
144 "Lockres %.*s, requested ivmode. flags 0x%x\n", 146 "Lockres %.*s, requested ivmode. flags 0x%x\n",
145 lockres->l_namelen, lockres->l_name, lockres->l_flags); 147 lockres->l_namelen, lockres->l_name, lockres->l_flags);
146 148
@@ -148,13 +150,13 @@ static void user_ast(void *opaque)
148 if (lockres->l_requested < lockres->l_level) { 150 if (lockres->l_requested < lockres->l_level) {
149 if (lockres->l_requested <= 151 if (lockres->l_requested <=
150 user_highest_compat_lock_level(lockres->l_blocking)) { 152 user_highest_compat_lock_level(lockres->l_blocking)) {
151 lockres->l_blocking = LKM_NLMODE; 153 lockres->l_blocking = DLM_LOCK_NL;
152 lockres->l_flags &= ~USER_LOCK_BLOCKED; 154 lockres->l_flags &= ~USER_LOCK_BLOCKED;
153 } 155 }
154 } 156 }
155 157
156 lockres->l_level = lockres->l_requested; 158 lockres->l_level = lockres->l_requested;
157 lockres->l_requested = LKM_IVMODE; 159 lockres->l_requested = DLM_LOCK_IV;
158 lockres->l_flags |= USER_LOCK_ATTACHED; 160 lockres->l_flags |= USER_LOCK_ATTACHED;
159 lockres->l_flags &= ~USER_LOCK_BUSY; 161 lockres->l_flags &= ~USER_LOCK_BUSY;
160 162
@@ -193,11 +195,11 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
193 return; 195 return;
194 196
195 switch (lockres->l_blocking) { 197 switch (lockres->l_blocking) {
196 case LKM_EXMODE: 198 case DLM_LOCK_EX:
197 if (!lockres->l_ex_holders && !lockres->l_ro_holders) 199 if (!lockres->l_ex_holders && !lockres->l_ro_holders)
198 queue = 1; 200 queue = 1;
199 break; 201 break;
200 case LKM_PRMODE: 202 case DLM_LOCK_PR:
201 if (!lockres->l_ex_holders) 203 if (!lockres->l_ex_holders)
202 queue = 1; 204 queue = 1;
203 break; 205 break;
@@ -209,12 +211,12 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
209 __user_dlm_queue_lockres(lockres); 211 __user_dlm_queue_lockres(lockres);
210} 212}
211 213
212static void user_bast(void *opaque, int level) 214static void user_bast(struct ocfs2_dlm_lksb *lksb, int level)
213{ 215{
214 struct user_lock_res *lockres = opaque; 216 struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
215 217
216 mlog(0, "Blocking AST fired for lockres %.*s. Blocking level %d\n", 218 mlog(ML_BASTS, "BAST fired for lockres %.*s, blocking %d, level %d\n",
217 lockres->l_namelen, lockres->l_name, level); 219 lockres->l_namelen, lockres->l_name, level, lockres->l_level);
218 220
219 spin_lock(&lockres->l_lock); 221 spin_lock(&lockres->l_lock);
220 lockres->l_flags |= USER_LOCK_BLOCKED; 222 lockres->l_flags |= USER_LOCK_BLOCKED;
@@ -227,15 +229,15 @@ static void user_bast(void *opaque, int level)
227 wake_up(&lockres->l_event); 229 wake_up(&lockres->l_event);
228} 230}
229 231
230static void user_unlock_ast(void *opaque, enum dlm_status status) 232static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status)
231{ 233{
232 struct user_lock_res *lockres = opaque; 234 struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
233 235
234 mlog(0, "UNLOCK AST called on lock %.*s\n", lockres->l_namelen, 236 mlog(ML_BASTS, "UNLOCK AST fired for lockres %.*s, flags 0x%x\n",
235 lockres->l_name); 237 lockres->l_namelen, lockres->l_name, lockres->l_flags);
236 238
237 if (status != DLM_NORMAL && status != DLM_CANCELGRANT) 239 if (status)
238 mlog(ML_ERROR, "Dlm returns status %d\n", status); 240 mlog(ML_ERROR, "dlm returns status %d\n", status);
239 241
240 spin_lock(&lockres->l_lock); 242 spin_lock(&lockres->l_lock);
241 /* The teardown flag gets set early during the unlock process, 243 /* The teardown flag gets set early during the unlock process,
@@ -243,7 +245,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
243 * for a concurrent cancel. */ 245 * for a concurrent cancel. */
244 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN 246 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN
245 && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { 247 && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) {
246 lockres->l_level = LKM_IVMODE; 248 lockres->l_level = DLM_LOCK_IV;
247 } else if (status == DLM_CANCELGRANT) { 249 } else if (status == DLM_CANCELGRANT) {
248 /* We tried to cancel a convert request, but it was 250 /* We tried to cancel a convert request, but it was
249 * already granted. Don't clear the busy flag - the 251 * already granted. Don't clear the busy flag - the
@@ -254,7 +256,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
254 } else { 256 } else {
255 BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); 257 BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
256 /* Cancel succeeded, we want to re-queue */ 258 /* Cancel succeeded, we want to re-queue */
257 lockres->l_requested = LKM_IVMODE; /* cancel an 259 lockres->l_requested = DLM_LOCK_IV; /* cancel an
258 * upconvert 260 * upconvert
259 * request. */ 261 * request. */
260 lockres->l_flags &= ~USER_LOCK_IN_CANCEL; 262 lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
@@ -271,6 +273,21 @@ out_noclear:
271 wake_up(&lockres->l_event); 273 wake_up(&lockres->l_event);
272} 274}
273 275
276/*
277 * This is the userdlmfs locking protocol version.
278 *
279 * See fs/ocfs2/dlmglue.c for more details on locking versions.
280 */
281static struct ocfs2_locking_protocol user_dlm_lproto = {
282 .lp_max_version = {
283 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
284 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
285 },
286 .lp_lock_ast = user_ast,
287 .lp_blocking_ast = user_bast,
288 .lp_unlock_ast = user_unlock_ast,
289};
290
274static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres) 291static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
275{ 292{
276 struct inode *inode; 293 struct inode *inode;
@@ -283,10 +300,10 @@ static void user_dlm_unblock_lock(struct work_struct *work)
283 int new_level, status; 300 int new_level, status;
284 struct user_lock_res *lockres = 301 struct user_lock_res *lockres =
285 container_of(work, struct user_lock_res, l_work); 302 container_of(work, struct user_lock_res, l_work);
286 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); 303 struct ocfs2_cluster_connection *conn =
304 cluster_connection_from_user_lockres(lockres);
287 305
288 mlog(0, "processing lockres %.*s\n", lockres->l_namelen, 306 mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
289 lockres->l_name);
290 307
291 spin_lock(&lockres->l_lock); 308 spin_lock(&lockres->l_lock);
292 309
@@ -304,17 +321,23 @@ static void user_dlm_unblock_lock(struct work_struct *work)
304 * flag, and finally we might get another bast which re-queues 321 * flag, and finally we might get another bast which re-queues
305 * us before our ast for the downconvert is called. */ 322 * us before our ast for the downconvert is called. */
306 if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { 323 if (!(lockres->l_flags & USER_LOCK_BLOCKED)) {
324 mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n",
325 lockres->l_namelen, lockres->l_name);
307 spin_unlock(&lockres->l_lock); 326 spin_unlock(&lockres->l_lock);
308 goto drop_ref; 327 goto drop_ref;
309 } 328 }
310 329
311 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { 330 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
331 mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n",
332 lockres->l_namelen, lockres->l_name);
312 spin_unlock(&lockres->l_lock); 333 spin_unlock(&lockres->l_lock);
313 goto drop_ref; 334 goto drop_ref;
314 } 335 }
315 336
316 if (lockres->l_flags & USER_LOCK_BUSY) { 337 if (lockres->l_flags & USER_LOCK_BUSY) {
317 if (lockres->l_flags & USER_LOCK_IN_CANCEL) { 338 if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
339 mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n",
340 lockres->l_namelen, lockres->l_name);
318 spin_unlock(&lockres->l_lock); 341 spin_unlock(&lockres->l_lock);
319 goto drop_ref; 342 goto drop_ref;
320 } 343 }
@@ -322,32 +345,31 @@ static void user_dlm_unblock_lock(struct work_struct *work)
322 lockres->l_flags |= USER_LOCK_IN_CANCEL; 345 lockres->l_flags |= USER_LOCK_IN_CANCEL;
323 spin_unlock(&lockres->l_lock); 346 spin_unlock(&lockres->l_lock);
324 347
325 status = dlmunlock(dlm, 348 status = ocfs2_dlm_unlock(conn, &lockres->l_lksb,
326 &lockres->l_lksb, 349 DLM_LKF_CANCEL);
327 LKM_CANCEL, 350 if (status)
328 user_unlock_ast, 351 user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
329 lockres);
330 if (status != DLM_NORMAL)
331 user_log_dlm_error("dlmunlock", status, lockres);
332 goto drop_ref; 352 goto drop_ref;
333 } 353 }
334 354
335 /* If there are still incompat holders, we can exit safely 355 /* If there are still incompat holders, we can exit safely
336 * without worrying about re-queueing this lock as that will 356 * without worrying about re-queueing this lock as that will
337 * happen on the last call to user_cluster_unlock. */ 357 * happen on the last call to user_cluster_unlock. */
338 if ((lockres->l_blocking == LKM_EXMODE) 358 if ((lockres->l_blocking == DLM_LOCK_EX)
339 && (lockres->l_ex_holders || lockres->l_ro_holders)) { 359 && (lockres->l_ex_holders || lockres->l_ro_holders)) {
340 spin_unlock(&lockres->l_lock); 360 spin_unlock(&lockres->l_lock);
341 mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n", 361 mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n",
342 lockres->l_ro_holders, lockres->l_ex_holders); 362 lockres->l_namelen, lockres->l_name,
363 lockres->l_ex_holders, lockres->l_ro_holders);
343 goto drop_ref; 364 goto drop_ref;
344 } 365 }
345 366
346 if ((lockres->l_blocking == LKM_PRMODE) 367 if ((lockres->l_blocking == DLM_LOCK_PR)
347 && lockres->l_ex_holders) { 368 && lockres->l_ex_holders) {
348 spin_unlock(&lockres->l_lock); 369 spin_unlock(&lockres->l_lock);
349 mlog(0, "can't downconvert for pr: ex = %u\n", 370 mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n",
350 lockres->l_ex_holders); 371 lockres->l_namelen, lockres->l_name,
372 lockres->l_ex_holders);
351 goto drop_ref; 373 goto drop_ref;
352 } 374 }
353 375
@@ -355,22 +377,17 @@ static void user_dlm_unblock_lock(struct work_struct *work)
355 new_level = user_highest_compat_lock_level(lockres->l_blocking); 377 new_level = user_highest_compat_lock_level(lockres->l_blocking);
356 lockres->l_requested = new_level; 378 lockres->l_requested = new_level;
357 lockres->l_flags |= USER_LOCK_BUSY; 379 lockres->l_flags |= USER_LOCK_BUSY;
358 mlog(0, "Downconvert lock from %d to %d\n", 380 mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n",
359 lockres->l_level, new_level); 381 lockres->l_namelen, lockres->l_name, lockres->l_level, new_level);
360 spin_unlock(&lockres->l_lock); 382 spin_unlock(&lockres->l_lock);
361 383
362 /* need lock downconvert request now... */ 384 /* need lock downconvert request now... */
363 status = dlmlock(dlm, 385 status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb,
364 new_level, 386 DLM_LKF_CONVERT|DLM_LKF_VALBLK,
365 &lockres->l_lksb, 387 lockres->l_name,
366 LKM_CONVERT|LKM_VALBLK, 388 lockres->l_namelen);
367 lockres->l_name, 389 if (status) {
368 lockres->l_namelen, 390 user_log_dlm_error("ocfs2_dlm_lock", status, lockres);
369 user_ast,
370 lockres,
371 user_bast);
372 if (status != DLM_NORMAL) {
373 user_log_dlm_error("dlmlock", status, lockres);
374 user_recover_from_dlm_error(lockres); 391 user_recover_from_dlm_error(lockres);
375 } 392 }
376 393
@@ -382,10 +399,10 @@ static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
382 int level) 399 int level)
383{ 400{
384 switch(level) { 401 switch(level) {
385 case LKM_EXMODE: 402 case DLM_LOCK_EX:
386 lockres->l_ex_holders++; 403 lockres->l_ex_holders++;
387 break; 404 break;
388 case LKM_PRMODE: 405 case DLM_LOCK_PR:
389 lockres->l_ro_holders++; 406 lockres->l_ro_holders++;
390 break; 407 break;
391 default: 408 default:
@@ -410,20 +427,19 @@ int user_dlm_cluster_lock(struct user_lock_res *lockres,
410 int lkm_flags) 427 int lkm_flags)
411{ 428{
412 int status, local_flags; 429 int status, local_flags;
413 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); 430 struct ocfs2_cluster_connection *conn =
431 cluster_connection_from_user_lockres(lockres);
414 432
415 if (level != LKM_EXMODE && 433 if (level != DLM_LOCK_EX &&
416 level != LKM_PRMODE) { 434 level != DLM_LOCK_PR) {
417 mlog(ML_ERROR, "lockres %.*s: invalid request!\n", 435 mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
418 lockres->l_namelen, lockres->l_name); 436 lockres->l_namelen, lockres->l_name);
419 status = -EINVAL; 437 status = -EINVAL;
420 goto bail; 438 goto bail;
421 } 439 }
422 440
423 mlog(0, "lockres %.*s: asking for %s lock, passed flags = 0x%x\n", 441 mlog(ML_BASTS, "lockres %.*s, level %d, flags = 0x%x\n",
424 lockres->l_namelen, lockres->l_name, 442 lockres->l_namelen, lockres->l_name, level, lkm_flags);
425 (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE",
426 lkm_flags);
427 443
428again: 444again:
429 if (signal_pending(current)) { 445 if (signal_pending(current)) {
@@ -457,35 +473,26 @@ again:
457 } 473 }
458 474
459 if (level > lockres->l_level) { 475 if (level > lockres->l_level) {
460 local_flags = lkm_flags | LKM_VALBLK; 476 local_flags = lkm_flags | DLM_LKF_VALBLK;
461 if (lockres->l_level != LKM_IVMODE) 477 if (lockres->l_level != DLM_LOCK_IV)
462 local_flags |= LKM_CONVERT; 478 local_flags |= DLM_LKF_CONVERT;
463 479
464 lockres->l_requested = level; 480 lockres->l_requested = level;
465 lockres->l_flags |= USER_LOCK_BUSY; 481 lockres->l_flags |= USER_LOCK_BUSY;
466 spin_unlock(&lockres->l_lock); 482 spin_unlock(&lockres->l_lock);
467 483
468 BUG_ON(level == LKM_IVMODE); 484 BUG_ON(level == DLM_LOCK_IV);
469 BUG_ON(level == LKM_NLMODE); 485 BUG_ON(level == DLM_LOCK_NL);
470 486
471 /* call dlm_lock to upgrade lock now */ 487 /* call dlm_lock to upgrade lock now */
472 status = dlmlock(dlm, 488 status = ocfs2_dlm_lock(conn, level, &lockres->l_lksb,
473 level, 489 local_flags, lockres->l_name,
474 &lockres->l_lksb, 490 lockres->l_namelen);
475 local_flags, 491 if (status) {
476 lockres->l_name, 492 if ((lkm_flags & DLM_LKF_NOQUEUE) &&
477 lockres->l_namelen, 493 (status != -EAGAIN))
478 user_ast, 494 user_log_dlm_error("ocfs2_dlm_lock",
479 lockres, 495 status, lockres);
480 user_bast);
481 if (status != DLM_NORMAL) {
482 if ((lkm_flags & LKM_NOQUEUE) &&
483 (status == DLM_NOTQUEUED))
484 status = -EAGAIN;
485 else {
486 user_log_dlm_error("dlmlock", status, lockres);
487 status = -EINVAL;
488 }
489 user_recover_from_dlm_error(lockres); 496 user_recover_from_dlm_error(lockres);
490 goto bail; 497 goto bail;
491 } 498 }
@@ -506,11 +513,11 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
506 int level) 513 int level)
507{ 514{
508 switch(level) { 515 switch(level) {
509 case LKM_EXMODE: 516 case DLM_LOCK_EX:
510 BUG_ON(!lockres->l_ex_holders); 517 BUG_ON(!lockres->l_ex_holders);
511 lockres->l_ex_holders--; 518 lockres->l_ex_holders--;
512 break; 519 break;
513 case LKM_PRMODE: 520 case DLM_LOCK_PR:
514 BUG_ON(!lockres->l_ro_holders); 521 BUG_ON(!lockres->l_ro_holders);
515 lockres->l_ro_holders--; 522 lockres->l_ro_holders--;
516 break; 523 break;
@@ -522,8 +529,8 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
522void user_dlm_cluster_unlock(struct user_lock_res *lockres, 529void user_dlm_cluster_unlock(struct user_lock_res *lockres,
523 int level) 530 int level)
524{ 531{
525 if (level != LKM_EXMODE && 532 if (level != DLM_LOCK_EX &&
526 level != LKM_PRMODE) { 533 level != DLM_LOCK_PR) {
527 mlog(ML_ERROR, "lockres %.*s: invalid request!\n", 534 mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
528 lockres->l_namelen, lockres->l_name); 535 lockres->l_namelen, lockres->l_name);
529 return; 536 return;
@@ -540,33 +547,40 @@ void user_dlm_write_lvb(struct inode *inode,
540 unsigned int len) 547 unsigned int len)
541{ 548{
542 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; 549 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
543 char *lvb = lockres->l_lksb.lvb; 550 char *lvb;
544 551
545 BUG_ON(len > DLM_LVB_LEN); 552 BUG_ON(len > DLM_LVB_LEN);
546 553
547 spin_lock(&lockres->l_lock); 554 spin_lock(&lockres->l_lock);
548 555
549 BUG_ON(lockres->l_level < LKM_EXMODE); 556 BUG_ON(lockres->l_level < DLM_LOCK_EX);
557 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
550 memcpy(lvb, val, len); 558 memcpy(lvb, val, len);
551 559
552 spin_unlock(&lockres->l_lock); 560 spin_unlock(&lockres->l_lock);
553} 561}
554 562
555void user_dlm_read_lvb(struct inode *inode, 563ssize_t user_dlm_read_lvb(struct inode *inode,
556 char *val, 564 char *val,
557 unsigned int len) 565 unsigned int len)
558{ 566{
559 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; 567 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
560 char *lvb = lockres->l_lksb.lvb; 568 char *lvb;
569 ssize_t ret = len;
561 570
562 BUG_ON(len > DLM_LVB_LEN); 571 BUG_ON(len > DLM_LVB_LEN);
563 572
564 spin_lock(&lockres->l_lock); 573 spin_lock(&lockres->l_lock);
565 574
566 BUG_ON(lockres->l_level < LKM_PRMODE); 575 BUG_ON(lockres->l_level < DLM_LOCK_PR);
567 memcpy(val, lvb, len); 576 if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) {
577 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
578 memcpy(val, lvb, len);
579 } else
580 ret = 0;
568 581
569 spin_unlock(&lockres->l_lock); 582 spin_unlock(&lockres->l_lock);
583 return ret;
570} 584}
571 585
572void user_dlm_lock_res_init(struct user_lock_res *lockres, 586void user_dlm_lock_res_init(struct user_lock_res *lockres,
@@ -576,9 +590,9 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres,
576 590
577 spin_lock_init(&lockres->l_lock); 591 spin_lock_init(&lockres->l_lock);
578 init_waitqueue_head(&lockres->l_event); 592 init_waitqueue_head(&lockres->l_event);
579 lockres->l_level = LKM_IVMODE; 593 lockres->l_level = DLM_LOCK_IV;
580 lockres->l_requested = LKM_IVMODE; 594 lockres->l_requested = DLM_LOCK_IV;
581 lockres->l_blocking = LKM_IVMODE; 595 lockres->l_blocking = DLM_LOCK_IV;
582 596
583 /* should have been checked before getting here. */ 597 /* should have been checked before getting here. */
584 BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN); 598 BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
@@ -592,9 +606,10 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres,
592int user_dlm_destroy_lock(struct user_lock_res *lockres) 606int user_dlm_destroy_lock(struct user_lock_res *lockres)
593{ 607{
594 int status = -EBUSY; 608 int status = -EBUSY;
595 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); 609 struct ocfs2_cluster_connection *conn =
610 cluster_connection_from_user_lockres(lockres);
596 611
597 mlog(0, "asked to destroy %.*s\n", lockres->l_namelen, lockres->l_name); 612 mlog(ML_BASTS, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
598 613
599 spin_lock(&lockres->l_lock); 614 spin_lock(&lockres->l_lock);
600 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { 615 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
@@ -627,14 +642,9 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
627 lockres->l_flags |= USER_LOCK_BUSY; 642 lockres->l_flags |= USER_LOCK_BUSY;
628 spin_unlock(&lockres->l_lock); 643 spin_unlock(&lockres->l_lock);
629 644
630 status = dlmunlock(dlm, 645 status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK);
631 &lockres->l_lksb, 646 if (status) {
632 LKM_VALBLK, 647 user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
633 user_unlock_ast,
634 lockres);
635 if (status != DLM_NORMAL) {
636 user_log_dlm_error("dlmunlock", status, lockres);
637 status = -EINVAL;
638 goto bail; 648 goto bail;
639 } 649 }
640 650
@@ -645,32 +655,34 @@ bail:
645 return status; 655 return status;
646} 656}
647 657
648struct dlm_ctxt *user_dlm_register_context(struct qstr *name, 658static void user_dlm_recovery_handler_noop(int node_num,
649 struct dlm_protocol_version *proto) 659 void *recovery_data)
650{ 660{
651 struct dlm_ctxt *dlm; 661 /* We ignore recovery events */
652 u32 dlm_key; 662 return;
653 char *domain; 663}
654
655 domain = kmalloc(name->len + 1, GFP_NOFS);
656 if (!domain) {
657 mlog_errno(-ENOMEM);
658 return ERR_PTR(-ENOMEM);
659 }
660 664
661 dlm_key = crc32_le(0, name->name, name->len); 665void user_dlm_set_locking_protocol(void)
666{
667 ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version);
668}
662 669
663 snprintf(domain, name->len + 1, "%.*s", name->len, name->name); 670struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name)
671{
672 int rc;
673 struct ocfs2_cluster_connection *conn;
664 674
665 dlm = dlm_register_domain(domain, dlm_key, proto); 675 rc = ocfs2_cluster_connect_agnostic(name->name, name->len,
666 if (IS_ERR(dlm)) 676 &user_dlm_lproto,
667 mlog_errno(PTR_ERR(dlm)); 677 user_dlm_recovery_handler_noop,
678 NULL, &conn);
679 if (rc)
680 mlog_errno(rc);
668 681
669 kfree(domain); 682 return rc ? ERR_PTR(rc) : conn;
670 return dlm;
671} 683}
672 684
673void user_dlm_unregister_context(struct dlm_ctxt *dlm) 685void user_dlm_unregister(struct ocfs2_cluster_connection *conn)
674{ 686{
675 dlm_unregister_domain(dlm); 687 ocfs2_cluster_disconnect(conn, 0);
676} 688}
diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h
index 0c3cc03c61fa..3b42d79531d7 100644
--- a/fs/ocfs2/dlm/userdlm.h
+++ b/fs/ocfs2/dlmfs/userdlm.h
@@ -57,7 +57,7 @@ struct user_lock_res {
57 int l_level; 57 int l_level;
58 unsigned int l_ro_holders; 58 unsigned int l_ro_holders;
59 unsigned int l_ex_holders; 59 unsigned int l_ex_holders;
60 struct dlm_lockstatus l_lksb; 60 struct ocfs2_dlm_lksb l_lksb;
61 61
62 int l_requested; 62 int l_requested;
63 int l_blocking; 63 int l_blocking;
@@ -80,15 +80,15 @@ void user_dlm_cluster_unlock(struct user_lock_res *lockres,
80void user_dlm_write_lvb(struct inode *inode, 80void user_dlm_write_lvb(struct inode *inode,
81 const char *val, 81 const char *val,
82 unsigned int len); 82 unsigned int len);
83void user_dlm_read_lvb(struct inode *inode, 83ssize_t user_dlm_read_lvb(struct inode *inode,
84 char *val, 84 char *val,
85 unsigned int len); 85 unsigned int len);
86struct dlm_ctxt *user_dlm_register_context(struct qstr *name, 86struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name);
87 struct dlm_protocol_version *proto); 87void user_dlm_unregister(struct ocfs2_cluster_connection *conn);
88void user_dlm_unregister_context(struct dlm_ctxt *dlm); 88void user_dlm_set_locking_protocol(void);
89 89
90struct dlmfs_inode_private { 90struct dlmfs_inode_private {
91 struct dlm_ctxt *ip_dlm; 91 struct ocfs2_cluster_connection *ip_conn;
92 92
93 struct user_lock_res ip_lockres; /* unused for directories. */ 93 struct user_lock_res ip_lockres; /* unused for directories. */
94 struct inode *ip_parent; 94 struct inode *ip_parent;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index c5e4a49e3a12..50c4ee805da4 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -297,6 +297,11 @@ static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
297 lockres->l_type == OCFS2_LOCK_TYPE_OPEN; 297 lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
298} 298}
299 299
300static inline struct ocfs2_lock_res *ocfs2_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
301{
302 return container_of(lksb, struct ocfs2_lock_res, l_lksb);
303}
304
300static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) 305static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
301{ 306{
302 BUG_ON(!ocfs2_is_inode_lock(lockres)); 307 BUG_ON(!ocfs2_is_inode_lock(lockres));
@@ -875,6 +880,14 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
875 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); 880 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
876 881
877 lockres->l_level = lockres->l_requested; 882 lockres->l_level = lockres->l_requested;
883
884 /*
885 * We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing
886 * the OCFS2_LOCK_BUSY flag to prevent the dc thread from
887 * downconverting the lock before the upconvert has fully completed.
888 */
889 lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
890
878 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 891 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
879 892
880 mlog_exit_void(); 893 mlog_exit_void();
@@ -907,8 +920,6 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
907 920
908 assert_spin_locked(&lockres->l_lock); 921 assert_spin_locked(&lockres->l_lock);
909 922
910 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
911
912 if (level > lockres->l_blocking) { 923 if (level > lockres->l_blocking) {
913 /* only schedule a downconvert if we haven't already scheduled 924 /* only schedule a downconvert if we haven't already scheduled
914 * one that goes low enough to satisfy the level we're 925 * one that goes low enough to satisfy the level we're
@@ -921,6 +932,13 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
921 lockres->l_blocking = level; 932 lockres->l_blocking = level;
922 } 933 }
923 934
935 mlog(ML_BASTS, "lockres %s, block %d, level %d, l_block %d, dwn %d\n",
936 lockres->l_name, level, lockres->l_level, lockres->l_blocking,
937 needs_downconvert);
938
939 if (needs_downconvert)
940 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
941
924 mlog_exit(needs_downconvert); 942 mlog_exit(needs_downconvert);
925 return needs_downconvert; 943 return needs_downconvert;
926} 944}
@@ -1031,18 +1049,17 @@ static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
1031 return lockres->l_pending_gen; 1049 return lockres->l_pending_gen;
1032} 1050}
1033 1051
1034 1052static void ocfs2_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level)
1035static void ocfs2_blocking_ast(void *opaque, int level)
1036{ 1053{
1037 struct ocfs2_lock_res *lockres = opaque; 1054 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1038 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); 1055 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
1039 int needs_downconvert; 1056 int needs_downconvert;
1040 unsigned long flags; 1057 unsigned long flags;
1041 1058
1042 BUG_ON(level <= DLM_LOCK_NL); 1059 BUG_ON(level <= DLM_LOCK_NL);
1043 1060
1044 mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n", 1061 mlog(ML_BASTS, "BAST fired for lockres %s, blocking %d, level %d, "
1045 lockres->l_name, level, lockres->l_level, 1062 "type %s\n", lockres->l_name, level, lockres->l_level,
1046 ocfs2_lock_type_string(lockres->l_type)); 1063 ocfs2_lock_type_string(lockres->l_type));
1047 1064
1048 /* 1065 /*
@@ -1063,9 +1080,9 @@ static void ocfs2_blocking_ast(void *opaque, int level)
1063 ocfs2_wake_downconvert_thread(osb); 1080 ocfs2_wake_downconvert_thread(osb);
1064} 1081}
1065 1082
1066static void ocfs2_locking_ast(void *opaque) 1083static void ocfs2_locking_ast(struct ocfs2_dlm_lksb *lksb)
1067{ 1084{
1068 struct ocfs2_lock_res *lockres = opaque; 1085 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1069 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); 1086 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
1070 unsigned long flags; 1087 unsigned long flags;
1071 int status; 1088 int status;
@@ -1086,6 +1103,10 @@ static void ocfs2_locking_ast(void *opaque)
1086 return; 1103 return;
1087 } 1104 }
1088 1105
1106 mlog(ML_BASTS, "AST fired for lockres %s, action %d, unlock %d, "
1107 "level %d => %d\n", lockres->l_name, lockres->l_action,
1108 lockres->l_unlock_action, lockres->l_level, lockres->l_requested);
1109
1089 switch(lockres->l_action) { 1110 switch(lockres->l_action) {
1090 case OCFS2_AST_ATTACH: 1111 case OCFS2_AST_ATTACH:
1091 ocfs2_generic_handle_attach_action(lockres); 1112 ocfs2_generic_handle_attach_action(lockres);
@@ -1098,8 +1119,8 @@ static void ocfs2_locking_ast(void *opaque)
1098 ocfs2_generic_handle_downconvert_action(lockres); 1119 ocfs2_generic_handle_downconvert_action(lockres);
1099 break; 1120 break;
1100 default: 1121 default:
1101 mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u " 1122 mlog(ML_ERROR, "lockres %s: AST fired with invalid action: %u, "
1102 "lockres flags = 0x%lx, unlock action: %u\n", 1123 "flags 0x%lx, unlock: %u\n",
1103 lockres->l_name, lockres->l_action, lockres->l_flags, 1124 lockres->l_name, lockres->l_action, lockres->l_flags,
1104 lockres->l_unlock_action); 1125 lockres->l_unlock_action);
1105 BUG(); 1126 BUG();
@@ -1125,6 +1146,88 @@ out:
1125 spin_unlock_irqrestore(&lockres->l_lock, flags); 1146 spin_unlock_irqrestore(&lockres->l_lock, flags);
1126} 1147}
1127 1148
1149static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
1150{
1151 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1152 unsigned long flags;
1153
1154 mlog_entry_void();
1155
1156 mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n",
1157 lockres->l_name, lockres->l_unlock_action);
1158
1159 spin_lock_irqsave(&lockres->l_lock, flags);
1160 if (error) {
1161 mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
1162 "unlock_action %d\n", error, lockres->l_name,
1163 lockres->l_unlock_action);
1164 spin_unlock_irqrestore(&lockres->l_lock, flags);
1165 mlog_exit_void();
1166 return;
1167 }
1168
1169 switch(lockres->l_unlock_action) {
1170 case OCFS2_UNLOCK_CANCEL_CONVERT:
1171 mlog(0, "Cancel convert success for %s\n", lockres->l_name);
1172 lockres->l_action = OCFS2_AST_INVALID;
1173 /* Downconvert thread may have requeued this lock, we
1174 * need to wake it. */
1175 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
1176 ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
1177 break;
1178 case OCFS2_UNLOCK_DROP_LOCK:
1179 lockres->l_level = DLM_LOCK_IV;
1180 break;
1181 default:
1182 BUG();
1183 }
1184
1185 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
1186 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
1187 wake_up(&lockres->l_event);
1188 spin_unlock_irqrestore(&lockres->l_lock, flags);
1189
1190 mlog_exit_void();
1191}
1192
1193/*
1194 * This is the filesystem locking protocol. It provides the lock handling
1195 * hooks for the underlying DLM. It has a maximum version number.
1196 * The version number allows interoperability with systems running at
1197 * the same major number and an equal or smaller minor number.
1198 *
1199 * Whenever the filesystem does new things with locks (adds or removes a
1200 * lock, orders them differently, does different things underneath a lock),
1201 * the version must be changed. The protocol is negotiated when joining
1202 * the dlm domain. A node may join the domain if its major version is
1203 * identical to all other nodes and its minor version is greater than
1204 * or equal to all other nodes. When its minor version is greater than
1205 * the other nodes, it will run at the minor version specified by the
1206 * other nodes.
1207 *
1208 * If a locking change is made that will not be compatible with older
1209 * versions, the major number must be increased and the minor version set
1210 * to zero. If a change merely adds a behavior that can be disabled when
1211 * speaking to older versions, the minor version must be increased. If a
1212 * change adds a fully backwards compatible change (eg, LVB changes that
1213 * are just ignored by older versions), the version does not need to be
1214 * updated.
1215 */
1216static struct ocfs2_locking_protocol lproto = {
1217 .lp_max_version = {
1218 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
1219 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
1220 },
1221 .lp_lock_ast = ocfs2_locking_ast,
1222 .lp_blocking_ast = ocfs2_blocking_ast,
1223 .lp_unlock_ast = ocfs2_unlock_ast,
1224};
1225
1226void ocfs2_set_locking_protocol(void)
1227{
1228 ocfs2_stack_glue_set_max_proto_version(&lproto.lp_max_version);
1229}
1230
1128static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, 1231static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
1129 int convert) 1232 int convert)
1130{ 1233{
@@ -1133,6 +1236,7 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
1133 mlog_entry_void(); 1236 mlog_entry_void();
1134 spin_lock_irqsave(&lockres->l_lock, flags); 1237 spin_lock_irqsave(&lockres->l_lock, flags);
1135 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 1238 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
1239 lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
1136 if (convert) 1240 if (convert)
1137 lockres->l_action = OCFS2_AST_INVALID; 1241 lockres->l_action = OCFS2_AST_INVALID;
1138 else 1242 else
@@ -1179,8 +1283,7 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
1179 &lockres->l_lksb, 1283 &lockres->l_lksb,
1180 dlm_flags, 1284 dlm_flags,
1181 lockres->l_name, 1285 lockres->l_name,
1182 OCFS2_LOCK_ID_MAX_LEN - 1, 1286 OCFS2_LOCK_ID_MAX_LEN - 1);
1183 lockres);
1184 lockres_clear_pending(lockres, gen, osb); 1287 lockres_clear_pending(lockres, gen, osb);
1185 if (ret) { 1288 if (ret) {
1186 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); 1289 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -1323,13 +1426,13 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
1323again: 1426again:
1324 wait = 0; 1427 wait = 0;
1325 1428
1429 spin_lock_irqsave(&lockres->l_lock, flags);
1430
1326 if (catch_signals && signal_pending(current)) { 1431 if (catch_signals && signal_pending(current)) {
1327 ret = -ERESTARTSYS; 1432 ret = -ERESTARTSYS;
1328 goto out; 1433 goto unlock;
1329 } 1434 }
1330 1435
1331 spin_lock_irqsave(&lockres->l_lock, flags);
1332
1333 mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING, 1436 mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
1334 "Cluster lock called on freeing lockres %s! flags " 1437 "Cluster lock called on freeing lockres %s! flags "
1335 "0x%lx\n", lockres->l_name, lockres->l_flags); 1438 "0x%lx\n", lockres->l_name, lockres->l_flags);
@@ -1346,6 +1449,25 @@ again:
1346 goto unlock; 1449 goto unlock;
1347 } 1450 }
1348 1451
1452 if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) {
1453 /*
1454 * We've upconverted. If the lock now has a level we can
1455 * work with, we take it. If, however, the lock is not at the
1456 * required level, we go thru the full cycle. One way this could
1457 * happen is if a process requesting an upconvert to PR is
1458 * closely followed by another requesting upconvert to an EX.
1459 * If the process requesting EX lands here, we want it to
1460 * continue attempting to upconvert and let the process
1461 * requesting PR take the lock.
1462 * If multiple processes request upconvert to PR, the first one
1463 * here will take the lock. The others will have to go thru the
1464 * OCFS2_LOCK_BLOCKED check to ensure that there is no pending
1465 * downconvert request.
1466 */
1467 if (level <= lockres->l_level)
1468 goto update_holders;
1469 }
1470
1349 if (lockres->l_flags & OCFS2_LOCK_BLOCKED && 1471 if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
1350 !ocfs2_may_continue_on_blocked_lock(lockres, level)) { 1472 !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
1351 /* is the lock is currently blocked on behalf of 1473 /* is the lock is currently blocked on behalf of
@@ -1383,7 +1505,7 @@ again:
1383 BUG_ON(level == DLM_LOCK_IV); 1505 BUG_ON(level == DLM_LOCK_IV);
1384 BUG_ON(level == DLM_LOCK_NL); 1506 BUG_ON(level == DLM_LOCK_NL);
1385 1507
1386 mlog(0, "lock %s, convert from %d to level = %d\n", 1508 mlog(ML_BASTS, "lockres %s, convert from %d to %d\n",
1387 lockres->l_name, lockres->l_level, level); 1509 lockres->l_name, lockres->l_level, level);
1388 1510
1389 /* call dlm_lock to upgrade lock now */ 1511 /* call dlm_lock to upgrade lock now */
@@ -1392,8 +1514,7 @@ again:
1392 &lockres->l_lksb, 1514 &lockres->l_lksb,
1393 lkm_flags, 1515 lkm_flags,
1394 lockres->l_name, 1516 lockres->l_name,
1395 OCFS2_LOCK_ID_MAX_LEN - 1, 1517 OCFS2_LOCK_ID_MAX_LEN - 1);
1396 lockres);
1397 lockres_clear_pending(lockres, gen, osb); 1518 lockres_clear_pending(lockres, gen, osb);
1398 if (ret) { 1519 if (ret) {
1399 if (!(lkm_flags & DLM_LKF_NOQUEUE) || 1520 if (!(lkm_flags & DLM_LKF_NOQUEUE) ||
@@ -1416,11 +1537,14 @@ again:
1416 goto again; 1537 goto again;
1417 } 1538 }
1418 1539
1540update_holders:
1419 /* Ok, if we get here then we're good to go. */ 1541 /* Ok, if we get here then we're good to go. */
1420 ocfs2_inc_holders(lockres, level); 1542 ocfs2_inc_holders(lockres, level);
1421 1543
1422 ret = 0; 1544 ret = 0;
1423unlock: 1545unlock:
1546 lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
1547
1424 spin_unlock_irqrestore(&lockres->l_lock, flags); 1548 spin_unlock_irqrestore(&lockres->l_lock, flags);
1425out: 1549out:
1426 /* 1550 /*
@@ -1757,7 +1881,7 @@ out:
1757 * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of 1881 * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
1758 * flock() calls. The locking approach this requires is sufficiently 1882 * flock() calls. The locking approach this requires is sufficiently
1759 * different from all other cluster lock types that we implement a 1883 * different from all other cluster lock types that we implement a
1760 * seperate path to the "low-level" dlm calls. In particular: 1884 * separate path to the "low-level" dlm calls. In particular:
1761 * 1885 *
1762 * - No optimization of lock levels is done - we take at exactly 1886 * - No optimization of lock levels is done - we take at exactly
1763 * what's been requested. 1887 * what's been requested.
@@ -1827,8 +1951,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1827 spin_unlock_irqrestore(&lockres->l_lock, flags); 1951 spin_unlock_irqrestore(&lockres->l_lock, flags);
1828 1952
1829 ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags, 1953 ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags,
1830 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1, 1954 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1);
1831 lockres);
1832 if (ret) { 1955 if (ret) {
1833 if (!trylock || (ret != -EAGAIN)) { 1956 if (!trylock || (ret != -EAGAIN)) {
1834 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); 1957 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -2957,7 +3080,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
2957 status = ocfs2_cluster_connect(osb->osb_cluster_stack, 3080 status = ocfs2_cluster_connect(osb->osb_cluster_stack,
2958 osb->uuid_str, 3081 osb->uuid_str,
2959 strlen(osb->uuid_str), 3082 strlen(osb->uuid_str),
2960 ocfs2_do_node_down, osb, 3083 &lproto, ocfs2_do_node_down, osb,
2961 &conn); 3084 &conn);
2962 if (status) { 3085 if (status) {
2963 mlog_errno(status); 3086 mlog_errno(status);
@@ -3024,50 +3147,6 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
3024 mlog_exit_void(); 3147 mlog_exit_void();
3025} 3148}
3026 3149
3027static void ocfs2_unlock_ast(void *opaque, int error)
3028{
3029 struct ocfs2_lock_res *lockres = opaque;
3030 unsigned long flags;
3031
3032 mlog_entry_void();
3033
3034 mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
3035 lockres->l_unlock_action);
3036
3037 spin_lock_irqsave(&lockres->l_lock, flags);
3038 if (error) {
3039 mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
3040 "unlock_action %d\n", error, lockres->l_name,
3041 lockres->l_unlock_action);
3042 spin_unlock_irqrestore(&lockres->l_lock, flags);
3043 mlog_exit_void();
3044 return;
3045 }
3046
3047 switch(lockres->l_unlock_action) {
3048 case OCFS2_UNLOCK_CANCEL_CONVERT:
3049 mlog(0, "Cancel convert success for %s\n", lockres->l_name);
3050 lockres->l_action = OCFS2_AST_INVALID;
3051 /* Downconvert thread may have requeued this lock, we
3052 * need to wake it. */
3053 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
3054 ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
3055 break;
3056 case OCFS2_UNLOCK_DROP_LOCK:
3057 lockres->l_level = DLM_LOCK_IV;
3058 break;
3059 default:
3060 BUG();
3061 }
3062
3063 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
3064 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
3065 wake_up(&lockres->l_event);
3066 spin_unlock_irqrestore(&lockres->l_lock, flags);
3067
3068 mlog_exit_void();
3069}
3070
3071static int ocfs2_drop_lock(struct ocfs2_super *osb, 3150static int ocfs2_drop_lock(struct ocfs2_super *osb,
3072 struct ocfs2_lock_res *lockres) 3151 struct ocfs2_lock_res *lockres)
3073{ 3152{
@@ -3135,8 +3214,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
3135 3214
3136 mlog(0, "lock %s\n", lockres->l_name); 3215 mlog(0, "lock %s\n", lockres->l_name);
3137 3216
3138 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags, 3217 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags);
3139 lockres);
3140 if (ret) { 3218 if (ret) {
3141 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); 3219 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
3142 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); 3220 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
@@ -3155,7 +3233,7 @@ out:
3155/* Mark the lockres as being dropped. It will no longer be 3233/* Mark the lockres as being dropped. It will no longer be
3156 * queued if blocking, but we still may have to wait on it 3234 * queued if blocking, but we still may have to wait on it
3157 * being dequeued from the downconvert thread before we can consider 3235 * being dequeued from the downconvert thread before we can consider
3158 * it safe to drop. 3236 * it safe to drop.
3159 * 3237 *
3160 * You can *not* attempt to call cluster_lock on this lockres anymore. */ 3238 * You can *not* attempt to call cluster_lock on this lockres anymore. */
3161void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) 3239void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
@@ -3244,13 +3322,20 @@ static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
3244 BUG_ON(lockres->l_blocking <= DLM_LOCK_NL); 3322 BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
3245 3323
3246 if (lockres->l_level <= new_level) { 3324 if (lockres->l_level <= new_level) {
3247 mlog(ML_ERROR, "lockres->l_level (%d) <= new_level (%d)\n", 3325 mlog(ML_ERROR, "lockres %s, lvl %d <= %d, blcklst %d, mask %d, "
3248 lockres->l_level, new_level); 3326 "type %d, flags 0x%lx, hold %d %d, act %d %d, req %d, "
3327 "block %d, pgen %d\n", lockres->l_name, lockres->l_level,
3328 new_level, list_empty(&lockres->l_blocked_list),
3329 list_empty(&lockres->l_mask_waiters), lockres->l_type,
3330 lockres->l_flags, lockres->l_ro_holders,
3331 lockres->l_ex_holders, lockres->l_action,
3332 lockres->l_unlock_action, lockres->l_requested,
3333 lockres->l_blocking, lockres->l_pending_gen);
3249 BUG(); 3334 BUG();
3250 } 3335 }
3251 3336
3252 mlog(0, "lock %s, new_level = %d, l_blocking = %d\n", 3337 mlog(ML_BASTS, "lockres %s, level %d => %d, blocking %d\n",
3253 lockres->l_name, new_level, lockres->l_blocking); 3338 lockres->l_name, lockres->l_level, new_level, lockres->l_blocking);
3254 3339
3255 lockres->l_action = OCFS2_AST_DOWNCONVERT; 3340 lockres->l_action = OCFS2_AST_DOWNCONVERT;
3256 lockres->l_requested = new_level; 3341 lockres->l_requested = new_level;
@@ -3269,6 +3354,9 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
3269 3354
3270 mlog_entry_void(); 3355 mlog_entry_void();
3271 3356
3357 mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
3358 lockres->l_level, new_level);
3359
3272 if (lvb) 3360 if (lvb)
3273 dlm_flags |= DLM_LKF_VALBLK; 3361 dlm_flags |= DLM_LKF_VALBLK;
3274 3362
@@ -3277,8 +3365,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
3277 &lockres->l_lksb, 3365 &lockres->l_lksb,
3278 dlm_flags, 3366 dlm_flags,
3279 lockres->l_name, 3367 lockres->l_name,
3280 OCFS2_LOCK_ID_MAX_LEN - 1, 3368 OCFS2_LOCK_ID_MAX_LEN - 1);
3281 lockres);
3282 lockres_clear_pending(lockres, generation, osb); 3369 lockres_clear_pending(lockres, generation, osb);
3283 if (ret) { 3370 if (ret) {
3284 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); 3371 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -3299,14 +3386,12 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
3299 assert_spin_locked(&lockres->l_lock); 3386 assert_spin_locked(&lockres->l_lock);
3300 3387
3301 mlog_entry_void(); 3388 mlog_entry_void();
3302 mlog(0, "lock %s\n", lockres->l_name);
3303 3389
3304 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { 3390 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
3305 /* If we're already trying to cancel a lock conversion 3391 /* If we're already trying to cancel a lock conversion
3306 * then just drop the spinlock and allow the caller to 3392 * then just drop the spinlock and allow the caller to
3307 * requeue this lock. */ 3393 * requeue this lock. */
3308 3394 mlog(ML_BASTS, "lockres %s, skip convert\n", lockres->l_name);
3309 mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
3310 return 0; 3395 return 0;
3311 } 3396 }
3312 3397
@@ -3321,6 +3406,8 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
3321 "lock %s, invalid flags: 0x%lx\n", 3406 "lock %s, invalid flags: 0x%lx\n",
3322 lockres->l_name, lockres->l_flags); 3407 lockres->l_name, lockres->l_flags);
3323 3408
3409 mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
3410
3324 return 1; 3411 return 1;
3325} 3412}
3326 3413
@@ -3330,16 +3417,15 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
3330 int ret; 3417 int ret;
3331 3418
3332 mlog_entry_void(); 3419 mlog_entry_void();
3333 mlog(0, "lock %s\n", lockres->l_name);
3334 3420
3335 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, 3421 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
3336 DLM_LKF_CANCEL, lockres); 3422 DLM_LKF_CANCEL);
3337 if (ret) { 3423 if (ret) {
3338 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); 3424 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
3339 ocfs2_recover_from_dlm_error(lockres, 0); 3425 ocfs2_recover_from_dlm_error(lockres, 0);
3340 } 3426 }
3341 3427
3342 mlog(0, "lock %s return from ocfs2_dlm_unlock\n", lockres->l_name); 3428 mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
3343 3429
3344 mlog_exit(ret); 3430 mlog_exit(ret);
3345 return ret; 3431 return ret;
@@ -3352,6 +3438,7 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
3352 unsigned long flags; 3438 unsigned long flags;
3353 int blocking; 3439 int blocking;
3354 int new_level; 3440 int new_level;
3441 int level;
3355 int ret = 0; 3442 int ret = 0;
3356 int set_lvb = 0; 3443 int set_lvb = 0;
3357 unsigned int gen; 3444 unsigned int gen;
@@ -3360,9 +3447,17 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
3360 3447
3361 spin_lock_irqsave(&lockres->l_lock, flags); 3448 spin_lock_irqsave(&lockres->l_lock, flags);
3362 3449
3363 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
3364
3365recheck: 3450recheck:
3451 /*
3452 * Is it still blocking? If not, we have no more work to do.
3453 */
3454 if (!(lockres->l_flags & OCFS2_LOCK_BLOCKED)) {
3455 BUG_ON(lockres->l_blocking != DLM_LOCK_NL);
3456 spin_unlock_irqrestore(&lockres->l_lock, flags);
3457 ret = 0;
3458 goto leave;
3459 }
3460
3366 if (lockres->l_flags & OCFS2_LOCK_BUSY) { 3461 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
3367 /* XXX 3462 /* XXX
3368 * This is a *big* race. The OCFS2_LOCK_PENDING flag 3463 * This is a *big* race. The OCFS2_LOCK_PENDING flag
@@ -3387,8 +3482,11 @@ recheck:
3387 * at the same time they set OCFS2_DLM_BUSY. They must 3482 * at the same time they set OCFS2_DLM_BUSY. They must
3388 * clear OCFS2_DLM_PENDING after dlm_lock() returns. 3483 * clear OCFS2_DLM_PENDING after dlm_lock() returns.
3389 */ 3484 */
3390 if (lockres->l_flags & OCFS2_LOCK_PENDING) 3485 if (lockres->l_flags & OCFS2_LOCK_PENDING) {
3486 mlog(ML_BASTS, "lockres %s, ReQ: Pending\n",
3487 lockres->l_name);
3391 goto leave_requeue; 3488 goto leave_requeue;
3489 }
3392 3490
3393 ctl->requeue = 1; 3491 ctl->requeue = 1;
3394 ret = ocfs2_prepare_cancel_convert(osb, lockres); 3492 ret = ocfs2_prepare_cancel_convert(osb, lockres);
@@ -3401,31 +3499,70 @@ recheck:
3401 goto leave; 3499 goto leave;
3402 } 3500 }
3403 3501
3502 /*
3503 * This prevents livelocks. OCFS2_LOCK_UPCONVERT_FINISHING flag is
3504 * set when the ast is received for an upconvert just before the
3505 * OCFS2_LOCK_BUSY flag is cleared. Now if the fs received a bast
3506 * on the heels of the ast, we want to delay the downconvert just
3507 * enough to allow the up requestor to do its task. Because this
3508 * lock is in the blocked queue, the lock will be downconverted
3509 * as soon as the requestor is done with the lock.
3510 */
3511 if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING)
3512 goto leave_requeue;
3513
3514 /*
3515 * How can we block and yet be at NL? We were trying to upconvert
3516 * from NL and got canceled. The code comes back here, and now
3517 * we notice and clear BLOCKING.
3518 */
3519 if (lockres->l_level == DLM_LOCK_NL) {
3520 BUG_ON(lockres->l_ex_holders || lockres->l_ro_holders);
3521 mlog(ML_BASTS, "lockres %s, Aborting dc\n", lockres->l_name);
3522 lockres->l_blocking = DLM_LOCK_NL;
3523 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
3524 spin_unlock_irqrestore(&lockres->l_lock, flags);
3525 goto leave;
3526 }
3527
3404 /* if we're blocking an exclusive and we have *any* holders, 3528 /* if we're blocking an exclusive and we have *any* holders,
3405 * then requeue. */ 3529 * then requeue. */
3406 if ((lockres->l_blocking == DLM_LOCK_EX) 3530 if ((lockres->l_blocking == DLM_LOCK_EX)
3407 && (lockres->l_ex_holders || lockres->l_ro_holders)) 3531 && (lockres->l_ex_holders || lockres->l_ro_holders)) {
3532 mlog(ML_BASTS, "lockres %s, ReQ: EX/PR Holders %u,%u\n",
3533 lockres->l_name, lockres->l_ex_holders,
3534 lockres->l_ro_holders);
3408 goto leave_requeue; 3535 goto leave_requeue;
3536 }
3409 3537
3410 /* If it's a PR we're blocking, then only 3538 /* If it's a PR we're blocking, then only
3411 * requeue if we've got any EX holders */ 3539 * requeue if we've got any EX holders */
3412 if (lockres->l_blocking == DLM_LOCK_PR && 3540 if (lockres->l_blocking == DLM_LOCK_PR &&
3413 lockres->l_ex_holders) 3541 lockres->l_ex_holders) {
3542 mlog(ML_BASTS, "lockres %s, ReQ: EX Holders %u\n",
3543 lockres->l_name, lockres->l_ex_holders);
3414 goto leave_requeue; 3544 goto leave_requeue;
3545 }
3415 3546
3416 /* 3547 /*
3417 * Can we get a lock in this state if the holder counts are 3548 * Can we get a lock in this state if the holder counts are
3418 * zero? The meta data unblock code used to check this. 3549 * zero? The meta data unblock code used to check this.
3419 */ 3550 */
3420 if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) 3551 if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
3421 && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) 3552 && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) {
3553 mlog(ML_BASTS, "lockres %s, ReQ: Lock Refreshing\n",
3554 lockres->l_name);
3422 goto leave_requeue; 3555 goto leave_requeue;
3556 }
3423 3557
3424 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking); 3558 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
3425 3559
3426 if (lockres->l_ops->check_downconvert 3560 if (lockres->l_ops->check_downconvert
3427 && !lockres->l_ops->check_downconvert(lockres, new_level)) 3561 && !lockres->l_ops->check_downconvert(lockres, new_level)) {
3562 mlog(ML_BASTS, "lockres %s, ReQ: Checkpointing\n",
3563 lockres->l_name);
3428 goto leave_requeue; 3564 goto leave_requeue;
3565 }
3429 3566
3430 /* If we get here, then we know that there are no more 3567 /* If we get here, then we know that there are no more
3431 * incompatible holders (and anyone asking for an incompatible 3568 * incompatible holders (and anyone asking for an incompatible
@@ -3438,17 +3575,24 @@ recheck:
3438 * may sleep, so we save off a copy of what we're blocking as 3575 * may sleep, so we save off a copy of what we're blocking as
3439 * it may change while we're not holding the spin lock. */ 3576 * it may change while we're not holding the spin lock. */
3440 blocking = lockres->l_blocking; 3577 blocking = lockres->l_blocking;
3578 level = lockres->l_level;
3441 spin_unlock_irqrestore(&lockres->l_lock, flags); 3579 spin_unlock_irqrestore(&lockres->l_lock, flags);
3442 3580
3443 ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking); 3581 ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking);
3444 3582
3445 if (ctl->unblock_action == UNBLOCK_STOP_POST) 3583 if (ctl->unblock_action == UNBLOCK_STOP_POST) {
3584 mlog(ML_BASTS, "lockres %s, UNBLOCK_STOP_POST\n",
3585 lockres->l_name);
3446 goto leave; 3586 goto leave;
3587 }
3447 3588
3448 spin_lock_irqsave(&lockres->l_lock, flags); 3589 spin_lock_irqsave(&lockres->l_lock, flags);
3449 if (blocking != lockres->l_blocking) { 3590 if ((blocking != lockres->l_blocking) || (level != lockres->l_level)) {
3450 /* If this changed underneath us, then we can't drop 3591 /* If this changed underneath us, then we can't drop
3451 * it just yet. */ 3592 * it just yet. */
3593 mlog(ML_BASTS, "lockres %s, block=%d:%d, level=%d:%d, "
3594 "Recheck\n", lockres->l_name, blocking,
3595 lockres->l_blocking, level, lockres->l_level);
3452 goto recheck; 3596 goto recheck;
3453 } 3597 }
3454 3598
@@ -3843,45 +3987,6 @@ void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex)
3843 ocfs2_cluster_unlock(osb, lockres, level); 3987 ocfs2_cluster_unlock(osb, lockres, level);
3844} 3988}
3845 3989
3846/*
3847 * This is the filesystem locking protocol. It provides the lock handling
3848 * hooks for the underlying DLM. It has a maximum version number.
3849 * The version number allows interoperability with systems running at
3850 * the same major number and an equal or smaller minor number.
3851 *
3852 * Whenever the filesystem does new things with locks (adds or removes a
3853 * lock, orders them differently, does different things underneath a lock),
3854 * the version must be changed. The protocol is negotiated when joining
3855 * the dlm domain. A node may join the domain if its major version is
3856 * identical to all other nodes and its minor version is greater than
3857 * or equal to all other nodes. When its minor version is greater than
3858 * the other nodes, it will run at the minor version specified by the
3859 * other nodes.
3860 *
3861 * If a locking change is made that will not be compatible with older
3862 * versions, the major number must be increased and the minor version set
3863 * to zero. If a change merely adds a behavior that can be disabled when
3864 * speaking to older versions, the minor version must be increased. If a
3865 * change adds a fully backwards compatible change (eg, LVB changes that
3866 * are just ignored by older versions), the version does not need to be
3867 * updated.
3868 */
3869static struct ocfs2_locking_protocol lproto = {
3870 .lp_max_version = {
3871 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
3872 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
3873 },
3874 .lp_lock_ast = ocfs2_locking_ast,
3875 .lp_blocking_ast = ocfs2_blocking_ast,
3876 .lp_unlock_ast = ocfs2_unlock_ast,
3877};
3878
3879void ocfs2_set_locking_protocol(void)
3880{
3881 ocfs2_stack_glue_set_locking_protocol(&lproto);
3882}
3883
3884
3885static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, 3990static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3886 struct ocfs2_lock_res *lockres) 3991 struct ocfs2_lock_res *lockres)
3887{ 3992{
@@ -3898,7 +4003,7 @@ static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3898 BUG_ON(!lockres); 4003 BUG_ON(!lockres);
3899 BUG_ON(!lockres->l_ops); 4004 BUG_ON(!lockres->l_ops);
3900 4005
3901 mlog(0, "lockres %s blocked.\n", lockres->l_name); 4006 mlog(ML_BASTS, "lockres %s blocked\n", lockres->l_name);
3902 4007
3903 /* Detect whether a lock has been marked as going away while 4008 /* Detect whether a lock has been marked as going away while
3904 * the downconvert thread was processing other things. A lock can 4009 * the downconvert thread was processing other things. A lock can
@@ -3921,7 +4026,7 @@ unqueue:
3921 } else 4026 } else
3922 ocfs2_schedule_blocked_lock(osb, lockres); 4027 ocfs2_schedule_blocked_lock(osb, lockres);
3923 4028
3924 mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name, 4029 mlog(ML_BASTS, "lockres %s, requeue = %s.\n", lockres->l_name,
3925 ctl.requeue ? "yes" : "no"); 4030 ctl.requeue ? "yes" : "no");
3926 spin_unlock_irqrestore(&lockres->l_lock, flags); 4031 spin_unlock_irqrestore(&lockres->l_lock, flags);
3927 4032
@@ -3943,7 +4048,7 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
3943 /* Do not schedule a lock for downconvert when it's on 4048 /* Do not schedule a lock for downconvert when it's on
3944 * the way to destruction - any nodes wanting access 4049 * the way to destruction - any nodes wanting access
3945 * to the resource will get it soon. */ 4050 * to the resource will get it soon. */
3946 mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n", 4051 mlog(ML_BASTS, "lockres %s won't be scheduled: flags 0x%lx\n",
3947 lockres->l_name, lockres->l_flags); 4052 lockres->l_name, lockres->l_flags);
3948 return; 4053 return;
3949 } 4054 }
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 15713cbb865c..19ad145d2af3 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -239,7 +239,7 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
239 mlog(0, "Encoding parent: blkno: %llu, generation: %u\n", 239 mlog(0, "Encoding parent: blkno: %llu, generation: %u\n",
240 (unsigned long long)blkno, generation); 240 (unsigned long long)blkno, generation);
241 } 241 }
242 242
243 *max_len = len; 243 *max_len = len;
244 244
245bail: 245bail:
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index d35a27f4523e..09e3fdfa6d33 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -24,6 +24,7 @@
24 24
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/slab.h>
27#include <linux/types.h> 28#include <linux/types.h>
28#include <linux/fiemap.h> 29#include <linux/fiemap.h>
29 30
@@ -192,7 +193,7 @@ static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
192 emi->ei_clusters += ins->ei_clusters; 193 emi->ei_clusters += ins->ei_clusters;
193 return 1; 194 return 1;
194 } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys && 195 } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
195 (ins->ei_cpos + ins->ei_clusters) == emi->ei_phys && 196 (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos &&
196 ins->ei_flags == emi->ei_flags) { 197 ins->ei_flags == emi->ei_flags) {
197 emi->ei_phys = ins->ei_phys; 198 emi->ei_phys = ins->ei_phys;
198 emi->ei_cpos = ins->ei_cpos; 199 emi->ei_cpos = ins->ei_cpos;
@@ -453,7 +454,7 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
453 if (i == -1) { 454 if (i == -1) {
454 /* 455 /*
455 * Holes can be larger than the maximum size of an 456 * Holes can be larger than the maximum size of an
456 * extent, so we return their lengths in a seperate 457 * extent, so we return their lengths in a separate
457 * field. 458 * field.
458 */ 459 */
459 if (hole_len) { 460 if (hole_len) {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 3d30a1c974a8..17947dc8341e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -107,6 +107,9 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
107 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 107 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
108 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); 108 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
109 109
110 if (file->f_mode & FMODE_WRITE)
111 dquot_initialize(inode);
112
110 spin_lock(&oi->ip_lock); 113 spin_lock(&oi->ip_lock);
111 114
112 /* Check that the inode hasn't been wiped from disk by another 115 /* Check that the inode hasn't been wiped from disk by another
@@ -629,11 +632,10 @@ restart_all:
629 } 632 }
630 633
631restarted_transaction: 634restarted_transaction:
632 if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 635 status = dquot_alloc_space_nodirty(inode,
633 clusters_to_add))) { 636 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
634 status = -EDQUOT; 637 if (status)
635 goto leave; 638 goto leave;
636 }
637 did_quota = 1; 639 did_quota = 1;
638 640
639 /* reserve a write to the file entry early on - that we if we 641 /* reserve a write to the file entry early on - that we if we
@@ -674,7 +676,7 @@ restarted_transaction:
674 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 676 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
675 spin_unlock(&OCFS2_I(inode)->ip_lock); 677 spin_unlock(&OCFS2_I(inode)->ip_lock);
676 /* Release unused quota reservation */ 678 /* Release unused quota reservation */
677 vfs_dq_free_space(inode, 679 dquot_free_space(inode,
678 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 680 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
679 did_quota = 0; 681 did_quota = 0;
680 682
@@ -710,7 +712,7 @@ restarted_transaction:
710 712
711leave: 713leave:
712 if (status < 0 && did_quota) 714 if (status < 0 && did_quota)
713 vfs_dq_free_space(inode, 715 dquot_free_space(inode,
714 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 716 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
715 if (handle) { 717 if (handle) {
716 ocfs2_commit_trans(osb, handle); 718 ocfs2_commit_trans(osb, handle);
@@ -749,7 +751,7 @@ static int ocfs2_write_zero_page(struct inode *inode,
749 int ret; 751 int ret;
750 752
751 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ 753 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
752 /* ugh. in prepare/commit_write, if from==to==start of block, we 754 /* ugh. in prepare/commit_write, if from==to==start of block, we
753 ** skip the prepare. make sure we never send an offset for the start 755 ** skip the prepare. make sure we never send an offset for the start
754 ** of a block 756 ** of a block
755 */ 757 */
@@ -978,6 +980,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
978 980
979 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 981 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
980 if (size_change) { 982 if (size_change) {
983 dquot_initialize(inode);
984
981 status = ocfs2_rw_lock(inode, 1); 985 status = ocfs2_rw_lock(inode, 1);
982 if (status < 0) { 986 if (status < 0) {
983 mlog_errno(status); 987 mlog_errno(status);
@@ -993,10 +997,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
993 } 997 }
994 998
995 if (size_change && attr->ia_size != i_size_read(inode)) { 999 if (size_change && attr->ia_size != i_size_read(inode)) {
996 if (attr->ia_size > sb->s_maxbytes) { 1000 status = inode_newsize_ok(inode, attr->ia_size);
997 status = -EFBIG; 1001 if (status)
998 goto bail_unlock; 1002 goto bail_unlock;
999 }
1000 1003
1001 if (i_size_read(inode) > attr->ia_size) { 1004 if (i_size_read(inode) > attr->ia_size) {
1002 if (ocfs2_should_order_data(inode)) { 1005 if (ocfs2_should_order_data(inode)) {
@@ -1021,7 +1024,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1021 /* 1024 /*
1022 * Gather pointers to quota structures so that allocation / 1025 * Gather pointers to quota structures so that allocation /
1023 * freeing of quota structures happens here and not inside 1026 * freeing of quota structures happens here and not inside
1024 * vfs_dq_transfer() where we have problems with lock ordering 1027 * dquot_transfer() where we have problems with lock ordering
1025 */ 1028 */
1026 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid 1029 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
1027 && OCFS2_HAS_RO_COMPAT_FEATURE(sb, 1030 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
@@ -1054,7 +1057,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1054 mlog_errno(status); 1057 mlog_errno(status);
1055 goto bail_unlock; 1058 goto bail_unlock;
1056 } 1059 }
1057 status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; 1060 status = dquot_transfer(inode, attr);
1058 if (status < 0) 1061 if (status < 0)
1059 goto bail_commit; 1062 goto bail_commit;
1060 } else { 1063 } else {
@@ -1772,13 +1775,14 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1772 loff_t *ppos, 1775 loff_t *ppos,
1773 size_t count, 1776 size_t count,
1774 int appending, 1777 int appending,
1775 int *direct_io) 1778 int *direct_io,
1779 int *has_refcount)
1776{ 1780{
1777 int ret = 0, meta_level = 0; 1781 int ret = 0, meta_level = 0;
1778 struct inode *inode = dentry->d_inode; 1782 struct inode *inode = dentry->d_inode;
1779 loff_t saved_pos, end; 1783 loff_t saved_pos, end;
1780 1784
1781 /* 1785 /*
1782 * We start with a read level meta lock and only jump to an ex 1786 * We start with a read level meta lock and only jump to an ex
1783 * if we need to make modifications here. 1787 * if we need to make modifications here.
1784 */ 1788 */
@@ -1833,6 +1837,10 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1833 saved_pos, 1837 saved_pos,
1834 count, 1838 count,
1835 &meta_level); 1839 &meta_level);
1840 if (has_refcount)
1841 *has_refcount = 1;
1842 if (direct_io)
1843 *direct_io = 0;
1836 } 1844 }
1837 1845
1838 if (ret < 0) { 1846 if (ret < 0) {
@@ -1899,7 +1907,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1899 loff_t pos) 1907 loff_t pos)
1900{ 1908{
1901 int ret, direct_io, appending, rw_level, have_alloc_sem = 0; 1909 int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
1902 int can_do_direct; 1910 int can_do_direct, has_refcount = 0;
1903 ssize_t written = 0; 1911 ssize_t written = 0;
1904 size_t ocount; /* original count */ 1912 size_t ocount; /* original count */
1905 size_t count; /* after file limit checks */ 1913 size_t count; /* after file limit checks */
@@ -1942,7 +1950,7 @@ relock:
1942 can_do_direct = direct_io; 1950 can_do_direct = direct_io;
1943 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, 1951 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
1944 iocb->ki_left, appending, 1952 iocb->ki_left, appending,
1945 &can_do_direct); 1953 &can_do_direct, &has_refcount);
1946 if (ret < 0) { 1954 if (ret < 0) {
1947 mlog_errno(ret); 1955 mlog_errno(ret);
1948 goto out; 1956 goto out;
@@ -2006,14 +2014,16 @@ out_dio:
2006 /* buffered aio wouldn't have proper lock coverage today */ 2014 /* buffered aio wouldn't have proper lock coverage today */
2007 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 2015 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
2008 2016
2009 if ((file->f_flags & O_DSYNC && !direct_io) || IS_SYNC(inode)) { 2017 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
2018 ((file->f_flags & O_DIRECT) && has_refcount)) {
2010 ret = filemap_fdatawrite_range(file->f_mapping, pos, 2019 ret = filemap_fdatawrite_range(file->f_mapping, pos,
2011 pos + count - 1); 2020 pos + count - 1);
2012 if (ret < 0) 2021 if (ret < 0)
2013 written = ret; 2022 written = ret;
2014 2023
2015 if (!ret && (old_size != i_size_read(inode) || 2024 if (!ret && (old_size != i_size_read(inode) ||
2016 old_clusters != OCFS2_I(inode)->ip_clusters)) { 2025 old_clusters != OCFS2_I(inode)->ip_clusters ||
2026 has_refcount)) {
2017 ret = jbd2_journal_force_commit(osb->journal->j_journal); 2027 ret = jbd2_journal_force_commit(osb->journal->j_journal);
2018 if (ret < 0) 2028 if (ret < 0)
2019 written = ret; 2029 written = ret;
@@ -2024,7 +2034,7 @@ out_dio:
2024 pos + count - 1); 2034 pos + count - 1);
2025 } 2035 }
2026 2036
2027 /* 2037 /*
2028 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 2038 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
2029 * function pointer which is called when o_direct io completes so that 2039 * function pointer which is called when o_direct io completes so that
2030 * it can unlock our rw lock. (it's the clustered equivalent of 2040 * it can unlock our rw lock. (it's the clustered equivalent of
@@ -2034,7 +2044,7 @@ out_dio:
2034 * async dio is going to do it in the future or an end_io after an 2044 * async dio is going to do it in the future or an end_io after an
2035 * error has already done it. 2045 * error has already done it.
2036 */ 2046 */
2037 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 2047 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2038 rw_level = -1; 2048 rw_level = -1;
2039 have_alloc_sem = 0; 2049 have_alloc_sem = 0;
2040 } 2050 }
@@ -2062,7 +2072,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
2062 int ret; 2072 int ret;
2063 2073
2064 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, 2074 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos,
2065 sd->total_len, 0, NULL); 2075 sd->total_len, 0, NULL, NULL);
2066 if (ret < 0) { 2076 if (ret < 0) {
2067 mlog_errno(ret); 2077 mlog_errno(ret);
2068 return ret; 2078 return ret;
@@ -2189,7 +2199,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2189 goto bail; 2199 goto bail;
2190 } 2200 }
2191 2201
2192 /* 2202 /*
2193 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 2203 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
2194 * need locks to protect pending reads from racing with truncate. 2204 * need locks to protect pending reads from racing with truncate.
2195 */ 2205 */
@@ -2211,10 +2221,10 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2211 * We're fine letting folks race truncates and extending 2221 * We're fine letting folks race truncates and extending
2212 * writes with read across the cluster, just like they can 2222 * writes with read across the cluster, just like they can
2213 * locally. Hence no rw_lock during read. 2223 * locally. Hence no rw_lock during read.
2214 * 2224 *
2215 * Take and drop the meta data lock to update inode fields 2225 * Take and drop the meta data lock to update inode fields
2216 * like i_size. This allows the checks down below 2226 * like i_size. This allows the checks down below
2217 * generic_file_aio_read() a chance of actually working. 2227 * generic_file_aio_read() a chance of actually working.
2218 */ 2228 */
2219 ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); 2229 ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
2220 if (ret < 0) { 2230 if (ret < 0) {
@@ -2239,7 +2249,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2239bail: 2249bail:
2240 if (have_alloc_sem) 2250 if (have_alloc_sem)
2241 up_read(&inode->i_alloc_sem); 2251 up_read(&inode->i_alloc_sem);
2242 if (rw_level != -1) 2252 if (rw_level != -1)
2243 ocfs2_rw_unlock(inode, rw_level); 2253 ocfs2_rw_unlock(inode, rw_level);
2244 mlog_exit(ret); 2254 mlog_exit(ret);
2245 2255
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c6e7213db868..1aa863dd901f 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -26,7 +26,6 @@
26 26
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h> 29#include <linux/highmem.h>
31 30
32#define MLOG_MASK_PREFIX ML_SUPER 31#define MLOG_MASK_PREFIX ML_SUPER
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 0297fb8982b8..07cc8bb68b6d 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/quotaops.h> 30#include <linux/quotaops.h>
@@ -475,7 +474,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
475 if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) { 474 if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
476 status = ocfs2_try_open_lock(inode, 0); 475 status = ocfs2_try_open_lock(inode, 0);
477 if (status) { 476 if (status) {
478 make_bad_inode(inode); 477 make_bad_inode(inode);
479 return status; 478 return status;
480 } 479 }
481 } 480 }
@@ -665,7 +664,7 @@ static int ocfs2_remove_inode(struct inode *inode,
665 } 664 }
666 665
667 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); 666 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
668 vfs_dq_free_inode(inode); 667 dquot_free_inode(inode);
669 668
670 status = ocfs2_free_dinode(handle, inode_alloc_inode, 669 status = ocfs2_free_dinode(handle, inode_alloc_inode,
671 inode_alloc_bh, di); 670 inode_alloc_bh, di);
@@ -684,7 +683,7 @@ bail:
684 return status; 683 return status;
685} 684}
686 685
687/* 686/*
688 * Serialize with orphan dir recovery. If the process doing 687 * Serialize with orphan dir recovery. If the process doing
689 * recovery on this orphan dir does an iget() with the dir 688 * recovery on this orphan dir does an iget() with the dir
690 * i_mutex held, we'll deadlock here. Instead we detect this 689 * i_mutex held, we'll deadlock here. Instead we detect this
@@ -891,6 +890,21 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
891 /* Do some basic inode verification... */ 890 /* Do some basic inode verification... */
892 di = (struct ocfs2_dinode *) di_bh->b_data; 891 di = (struct ocfs2_dinode *) di_bh->b_data;
893 if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) { 892 if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) {
893 /*
894 * Inodes in the orphan dir must have ORPHANED_FL. The only
895 * inodes that come back out of the orphan dir are reflink
896 * targets. A reflink target may be moved out of the orphan
897 * dir between the time we scan the directory and the time we
898 * process it. This would lead to HAS_REFCOUNT_FL being set but
899 * ORPHANED_FL not.
900 */
901 if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) {
902 mlog(0, "Reflinked inode %llu is no longer orphaned. "
903 "it shouldn't be deleted\n",
904 (unsigned long long)oi->ip_blkno);
905 goto bail;
906 }
907
894 /* for lack of a better error? */ 908 /* for lack of a better error? */
895 status = -EEXIST; 909 status = -EEXIST;
896 mlog(ML_ERROR, 910 mlog(ML_ERROR,
@@ -971,6 +985,8 @@ void ocfs2_delete_inode(struct inode *inode)
971 goto bail; 985 goto bail;
972 } 986 }
973 987
988 dquot_initialize(inode);
989
974 if (!ocfs2_inode_is_valid_to_delete(inode)) { 990 if (!ocfs2_inode_is_valid_to_delete(inode)) {
975 /* It's probably not necessary to truncate_inode_pages 991 /* It's probably not necessary to truncate_inode_pages
976 * here but we do it for safety anyway (it will most 992 * here but we do it for safety anyway (it will most
@@ -1087,6 +1103,8 @@ void ocfs2_clear_inode(struct inode *inode)
1087 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, 1103 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
1088 "Inode=%lu\n", inode->i_ino); 1104 "Inode=%lu\n", inode->i_ino);
1089 1105
1106 dquot_drop(inode);
1107
1090 /* To preven remote deletes we hold open lock before, now it 1108 /* To preven remote deletes we hold open lock before, now it
1091 * is time to unlock PR and EX open locks. */ 1109 * is time to unlock PR and EX open locks. */
1092 ocfs2_open_unlock(inode); 1110 ocfs2_open_unlock(inode);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 31fbb0619510..7d9d9c132cef 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,6 +7,7 @@
7 7
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/compat.h>
10 11
11#define MLOG_MASK_PREFIX ML_INODE 12#define MLOG_MASK_PREFIX ML_INODE
12#include <cluster/masklog.h> 13#include <cluster/masklog.h>
@@ -181,6 +182,10 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
181#ifdef CONFIG_COMPAT 182#ifdef CONFIG_COMPAT
182long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) 183long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
183{ 184{
185 bool preserve;
186 struct reflink_arguments args;
187 struct inode *inode = file->f_path.dentry->d_inode;
188
184 switch (cmd) { 189 switch (cmd) {
185 case OCFS2_IOC32_GETFLAGS: 190 case OCFS2_IOC32_GETFLAGS:
186 cmd = OCFS2_IOC_GETFLAGS; 191 cmd = OCFS2_IOC_GETFLAGS;
@@ -195,8 +200,15 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
195 case OCFS2_IOC_GROUP_EXTEND: 200 case OCFS2_IOC_GROUP_EXTEND:
196 case OCFS2_IOC_GROUP_ADD: 201 case OCFS2_IOC_GROUP_ADD:
197 case OCFS2_IOC_GROUP_ADD64: 202 case OCFS2_IOC_GROUP_ADD64:
198 case OCFS2_IOC_REFLINK:
199 break; 203 break;
204 case OCFS2_IOC_REFLINK:
205 if (copy_from_user(&args, (struct reflink_arguments *)arg,
206 sizeof(args)))
207 return -EFAULT;
208 preserve = (args.preserve != 0);
209
210 return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
211 compat_ptr(args.new_path), preserve);
200 default: 212 default:
201 return -ENOIOCTLCMD; 213 return -ENOIOCTLCMD;
202 } 214 }
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index cf9a5ee30fef..0cd5323bd3f0 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -7,10 +7,10 @@
7 * 7 *
8 */ 8 */
9 9
10#ifndef OCFS2_IOCTL_H 10#ifndef OCFS2_IOCTL_PROTO_H
11#define OCFS2_IOCTL_H 11#define OCFS2_IOCTL_PROTO_H
12 12
13long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); 13long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
14long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); 14long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
15 15
16#endif /* OCFS2_IOCTL_H */ 16#endif /* OCFS2_IOCTL_PROTO_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index bf34c491ae96..9336c60e3a36 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -2034,7 +2034,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
2034 status = -ENOENT; 2034 status = -ENOENT;
2035 mlog_errno(status); 2035 mlog_errno(status);
2036 return status; 2036 return status;
2037 } 2037 }
2038 2038
2039 mutex_lock(&orphan_dir_inode->i_mutex); 2039 mutex_lock(&orphan_dir_inode->i_mutex);
2040 status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0); 2040 status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ac10f83edb95..c983715d8d8c 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -476,7 +476,7 @@ out_mutex:
476 476
477out: 477out:
478 if (!status) 478 if (!status)
479 ocfs2_init_inode_steal_slot(osb); 479 ocfs2_init_steal_slots(osb);
480 mlog_exit(status); 480 mlog_exit(status);
481 return status; 481 return status;
482} 482}
@@ -872,8 +872,10 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
872 (unsigned long long)la_start_blk, 872 (unsigned long long)la_start_blk,
873 (unsigned long long)blkno); 873 (unsigned long long)blkno);
874 874
875 status = ocfs2_free_clusters(handle, main_bm_inode, 875 status = ocfs2_release_clusters(handle,
876 main_bm_bh, blkno, count); 876 main_bm_inode,
877 main_bm_bh, blkno,
878 count);
877 if (status < 0) { 879 if (status < 0) {
878 mlog_errno(status); 880 mlog_errno(status);
879 goto bail; 881 goto bail;
@@ -984,8 +986,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
984 } 986 }
985 987
986retry_enospc: 988retry_enospc:
987 (*ac)->ac_bits_wanted = osb->local_alloc_bits; 989 (*ac)->ac_bits_wanted = osb->local_alloc_default_bits;
988
989 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); 990 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
990 if (status == -ENOSPC) { 991 if (status == -ENOSPC) {
991 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) == 992 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
@@ -1061,6 +1062,7 @@ retry_enospc:
1061 OCFS2_LA_DISABLED) 1062 OCFS2_LA_DISABLED)
1062 goto bail; 1063 goto bail;
1063 1064
1065 ac->ac_bits_wanted = osb->local_alloc_default_bits;
1064 status = ocfs2_claim_clusters(osb, handle, ac, 1066 status = ocfs2_claim_clusters(osb, handle, ac,
1065 osb->local_alloc_bits, 1067 osb->local_alloc_bits,
1066 &cluster_off, 1068 &cluster_off,
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 544ac6245175..b5cb3ede9408 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -133,7 +133,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
133 133
134 if (!(fl->fl_flags & FL_POSIX)) 134 if (!(fl->fl_flags & FL_POSIX))
135 return -ENOLCK; 135 return -ENOLCK;
136 if (__mandatory_lock(inode)) 136 if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
137 return -ENOLCK; 137 return -ENOLCK;
138 138
139 return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); 139 return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 39737613424a..7898bd3a99f5 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/uio.h> 30#include <linux/uio.h>
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 50fb26a6a5f5..b1eb50ae4097 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -84,7 +84,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
84static int ocfs2_orphan_add(struct ocfs2_super *osb, 84static int ocfs2_orphan_add(struct ocfs2_super *osb,
85 handle_t *handle, 85 handle_t *handle,
86 struct inode *inode, 86 struct inode *inode,
87 struct ocfs2_dinode *fe, 87 struct buffer_head *fe_bh,
88 char *name, 88 char *name,
89 struct ocfs2_dir_lookup_result *lookup, 89 struct ocfs2_dir_lookup_result *lookup,
90 struct inode *orphan_dir_inode); 90 struct inode *orphan_dir_inode);
@@ -212,7 +212,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
212 } else 212 } else
213 inode->i_gid = current_fsgid(); 213 inode->i_gid = current_fsgid();
214 inode->i_mode = mode; 214 inode->i_mode = mode;
215 vfs_dq_init(inode); 215 dquot_initialize(inode);
216 return inode; 216 return inode;
217} 217}
218 218
@@ -244,6 +244,8 @@ static int ocfs2_mknod(struct inode *dir,
244 (unsigned long)dev, dentry->d_name.len, 244 (unsigned long)dev, dentry->d_name.len,
245 dentry->d_name.name); 245 dentry->d_name.name);
246 246
247 dquot_initialize(dir);
248
247 /* get our super block */ 249 /* get our super block */
248 osb = OCFS2_SB(dir->i_sb); 250 osb = OCFS2_SB(dir->i_sb);
249 251
@@ -348,13 +350,9 @@ static int ocfs2_mknod(struct inode *dir,
348 goto leave; 350 goto leave;
349 } 351 }
350 352
351 /* We don't use standard VFS wrapper because we don't want vfs_dq_init 353 status = dquot_alloc_inode(inode);
352 * to be called. */ 354 if (status)
353 if (sb_any_quota_active(osb->sb) &&
354 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
355 status = -EDQUOT;
356 goto leave; 355 goto leave;
357 }
358 did_quota_inode = 1; 356 did_quota_inode = 1;
359 357
360 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, 358 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
@@ -431,7 +429,7 @@ static int ocfs2_mknod(struct inode *dir,
431 status = 0; 429 status = 0;
432leave: 430leave:
433 if (status < 0 && did_quota_inode) 431 if (status < 0 && did_quota_inode)
434 vfs_dq_free_inode(inode); 432 dquot_free_inode(inode);
435 if (handle) 433 if (handle)
436 ocfs2_commit_trans(osb, handle); 434 ocfs2_commit_trans(osb, handle);
437 435
@@ -636,6 +634,8 @@ static int ocfs2_link(struct dentry *old_dentry,
636 if (S_ISDIR(inode->i_mode)) 634 if (S_ISDIR(inode->i_mode))
637 return -EPERM; 635 return -EPERM;
638 636
637 dquot_initialize(dir);
638
639 err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT); 639 err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT);
640 if (err < 0) { 640 if (err < 0) {
641 if (err != -ENOENT) 641 if (err != -ENOENT)
@@ -791,6 +791,8 @@ static int ocfs2_unlink(struct inode *dir,
791 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, 791 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
792 dentry->d_name.len, dentry->d_name.name); 792 dentry->d_name.len, dentry->d_name.name);
793 793
794 dquot_initialize(dir);
795
794 BUG_ON(dentry->d_parent->d_inode != dir); 796 BUG_ON(dentry->d_parent->d_inode != dir);
795 797
796 mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); 798 mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -877,7 +879,7 @@ static int ocfs2_unlink(struct inode *dir,
877 fe = (struct ocfs2_dinode *) fe_bh->b_data; 879 fe = (struct ocfs2_dinode *) fe_bh->b_data;
878 880
879 if (inode_is_unlinkable(inode)) { 881 if (inode_is_unlinkable(inode)) {
880 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name, 882 status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name,
881 &orphan_insert, orphan_dir); 883 &orphan_insert, orphan_dir);
882 if (status < 0) { 884 if (status < 0) {
883 mlog_errno(status); 885 mlog_errno(status);
@@ -1051,6 +1053,9 @@ static int ocfs2_rename(struct inode *old_dir,
1051 old_dentry->d_name.len, old_dentry->d_name.name, 1053 old_dentry->d_name.len, old_dentry->d_name.name,
1052 new_dentry->d_name.len, new_dentry->d_name.name); 1054 new_dentry->d_name.len, new_dentry->d_name.name);
1053 1055
1056 dquot_initialize(old_dir);
1057 dquot_initialize(new_dir);
1058
1054 osb = OCFS2_SB(old_dir->i_sb); 1059 osb = OCFS2_SB(old_dir->i_sb);
1055 1060
1056 if (new_inode) { 1061 if (new_inode) {
@@ -1295,7 +1300,7 @@ static int ocfs2_rename(struct inode *old_dir,
1295 if (S_ISDIR(new_inode->i_mode) || 1300 if (S_ISDIR(new_inode->i_mode) ||
1296 (ocfs2_read_links_count(newfe) == 1)) { 1301 (ocfs2_read_links_count(newfe) == 1)) {
1297 status = ocfs2_orphan_add(osb, handle, new_inode, 1302 status = ocfs2_orphan_add(osb, handle, new_inode,
1298 newfe, orphan_name, 1303 newfe_bh, orphan_name,
1299 &orphan_insert, orphan_dir); 1304 &orphan_insert, orphan_dir);
1300 if (status < 0) { 1305 if (status < 0) {
1301 mlog_errno(status); 1306 mlog_errno(status);
@@ -1599,6 +1604,8 @@ static int ocfs2_symlink(struct inode *dir,
1599 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1604 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1600 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1605 dentry, symname, dentry->d_name.len, dentry->d_name.name);
1601 1606
1607 dquot_initialize(dir);
1608
1602 sb = dir->i_sb; 1609 sb = dir->i_sb;
1603 osb = OCFS2_SB(sb); 1610 osb = OCFS2_SB(sb);
1604 1611
@@ -1688,13 +1695,9 @@ static int ocfs2_symlink(struct inode *dir,
1688 goto bail; 1695 goto bail;
1689 } 1696 }
1690 1697
1691 /* We don't use standard VFS wrapper because we don't want vfs_dq_init 1698 status = dquot_alloc_inode(inode);
1692 * to be called. */ 1699 if (status)
1693 if (sb_any_quota_active(osb->sb) &&
1694 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
1695 status = -EDQUOT;
1696 goto bail; 1700 goto bail;
1697 }
1698 did_quota_inode = 1; 1701 did_quota_inode = 1;
1699 1702
1700 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, 1703 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry,
@@ -1716,11 +1719,10 @@ static int ocfs2_symlink(struct inode *dir,
1716 u32 offset = 0; 1719 u32 offset = 0;
1717 1720
1718 inode->i_op = &ocfs2_symlink_inode_operations; 1721 inode->i_op = &ocfs2_symlink_inode_operations;
1719 if (vfs_dq_alloc_space_nodirty(inode, 1722 status = dquot_alloc_space_nodirty(inode,
1720 ocfs2_clusters_to_bytes(osb->sb, 1))) { 1723 ocfs2_clusters_to_bytes(osb->sb, 1));
1721 status = -EDQUOT; 1724 if (status)
1722 goto bail; 1725 goto bail;
1723 }
1724 did_quota = 1; 1726 did_quota = 1;
1725 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, 1727 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
1726 new_fe_bh, 1728 new_fe_bh,
@@ -1788,10 +1790,10 @@ static int ocfs2_symlink(struct inode *dir,
1788 d_instantiate(dentry, inode); 1790 d_instantiate(dentry, inode);
1789bail: 1791bail:
1790 if (status < 0 && did_quota) 1792 if (status < 0 && did_quota)
1791 vfs_dq_free_space_nodirty(inode, 1793 dquot_free_space_nodirty(inode,
1792 ocfs2_clusters_to_bytes(osb->sb, 1)); 1794 ocfs2_clusters_to_bytes(osb->sb, 1));
1793 if (status < 0 && did_quota_inode) 1795 if (status < 0 && did_quota_inode)
1794 vfs_dq_free_inode(inode); 1796 dquot_free_inode(inode);
1795 if (handle) 1797 if (handle)
1796 ocfs2_commit_trans(osb, handle); 1798 ocfs2_commit_trans(osb, handle);
1797 1799
@@ -1909,7 +1911,7 @@ leave:
1909static int ocfs2_orphan_add(struct ocfs2_super *osb, 1911static int ocfs2_orphan_add(struct ocfs2_super *osb,
1910 handle_t *handle, 1912 handle_t *handle,
1911 struct inode *inode, 1913 struct inode *inode,
1912 struct ocfs2_dinode *fe, 1914 struct buffer_head *fe_bh,
1913 char *name, 1915 char *name,
1914 struct ocfs2_dir_lookup_result *lookup, 1916 struct ocfs2_dir_lookup_result *lookup,
1915 struct inode *orphan_dir_inode) 1917 struct inode *orphan_dir_inode)
@@ -1917,6 +1919,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1917 struct buffer_head *orphan_dir_bh = NULL; 1919 struct buffer_head *orphan_dir_bh = NULL;
1918 int status = 0; 1920 int status = 0;
1919 struct ocfs2_dinode *orphan_fe; 1921 struct ocfs2_dinode *orphan_fe;
1922 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1920 1923
1921 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 1924 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
1922 1925
@@ -1957,6 +1960,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1957 goto leave; 1960 goto leave;
1958 } 1961 }
1959 1962
1963 /*
1964 * We're going to journal the change of i_flags and i_orphaned_slot.
1965 * It's safe anyway, though some callers may duplicate the journaling.
1966 * Journaling within the func just make the logic look more
1967 * straightforward.
1968 */
1969 status = ocfs2_journal_access_di(handle,
1970 INODE_CACHE(inode),
1971 fe_bh,
1972 OCFS2_JOURNAL_ACCESS_WRITE);
1973 if (status < 0) {
1974 mlog_errno(status);
1975 goto leave;
1976 }
1977
1960 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); 1978 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
1961 1979
1962 /* Record which orphan dir our inode now resides 1980 /* Record which orphan dir our inode now resides
@@ -1964,6 +1982,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1964 * dir to lock. */ 1982 * dir to lock. */
1965 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); 1983 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
1966 1984
1985 ocfs2_journal_dirty(handle, fe_bh);
1986
1967 mlog(0, "Inode %llu orphaned in slot %d\n", 1987 mlog(0, "Inode %llu orphaned in slot %d\n",
1968 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); 1988 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
1969 1989
@@ -2099,13 +2119,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2099 goto leave; 2119 goto leave;
2100 } 2120 }
2101 2121
2102 /* We don't use standard VFS wrapper because we don't want vfs_dq_init 2122 status = dquot_alloc_inode(inode);
2103 * to be called. */ 2123 if (status)
2104 if (sb_any_quota_active(osb->sb) &&
2105 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
2106 status = -EDQUOT;
2107 goto leave; 2124 goto leave;
2108 }
2109 did_quota_inode = 1; 2125 did_quota_inode = 1;
2110 2126
2111 inode->i_nlink = 0; 2127 inode->i_nlink = 0;
@@ -2125,7 +2141,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2125 } 2141 }
2126 2142
2127 di = (struct ocfs2_dinode *)new_di_bh->b_data; 2143 di = (struct ocfs2_dinode *)new_di_bh->b_data;
2128 status = ocfs2_orphan_add(osb, handle, inode, di, orphan_name, 2144 status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
2129 &orphan_insert, orphan_dir); 2145 &orphan_insert, orphan_dir);
2130 if (status < 0) { 2146 if (status < 0) {
2131 mlog_errno(status); 2147 mlog_errno(status);
@@ -2140,7 +2156,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2140 insert_inode_hash(inode); 2156 insert_inode_hash(inode);
2141leave: 2157leave:
2142 if (status < 0 && did_quota_inode) 2158 if (status < 0 && did_quota_inode)
2143 vfs_dq_free_inode(inode); 2159 dquot_free_inode(inode);
2144 if (handle) 2160 if (handle)
2145 ocfs2_commit_trans(osb, handle); 2161 ocfs2_commit_trans(osb, handle);
2146 2162
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 9362eea7424b..adf5e2ebc2c4 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -42,6 +42,7 @@
42 42
43#include "ocfs2_fs.h" 43#include "ocfs2_fs.h"
44#include "ocfs2_lockid.h" 44#include "ocfs2_lockid.h"
45#include "ocfs2_ioctl.h"
45 46
46/* For struct ocfs2_blockcheck_stats */ 47/* For struct ocfs2_blockcheck_stats */
47#include "blockcheck.h" 48#include "blockcheck.h"
@@ -136,6 +137,10 @@ enum ocfs2_unlock_action {
136#define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a 137#define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a
137 call to dlm_lock. Only 138 call to dlm_lock. Only
138 exists with BUSY set. */ 139 exists with BUSY set. */
140#define OCFS2_LOCK_UPCONVERT_FINISHING (0x00000800) /* blocks the dc thread
141 * from downconverting
142 * before the upconvert
143 * has completed */
139 144
140struct ocfs2_lock_res_ops; 145struct ocfs2_lock_res_ops;
141 146
@@ -155,7 +160,7 @@ struct ocfs2_lock_res {
155 int l_level; 160 int l_level;
156 unsigned int l_ro_holders; 161 unsigned int l_ro_holders;
157 unsigned int l_ex_holders; 162 unsigned int l_ex_holders;
158 union ocfs2_dlm_lksb l_lksb; 163 struct ocfs2_dlm_lksb l_lksb;
159 164
160 /* used from AST/BAST funcs. */ 165 /* used from AST/BAST funcs. */
161 enum ocfs2_ast_action l_action; 166 enum ocfs2_ast_action l_action;
@@ -301,7 +306,9 @@ struct ocfs2_super
301 u32 s_next_generation; 306 u32 s_next_generation;
302 unsigned long osb_flags; 307 unsigned long osb_flags;
303 s16 s_inode_steal_slot; 308 s16 s_inode_steal_slot;
309 s16 s_meta_steal_slot;
304 atomic_t s_num_inodes_stolen; 310 atomic_t s_num_inodes_stolen;
311 atomic_t s_num_meta_stolen;
305 312
306 unsigned long s_mount_opt; 313 unsigned long s_mount_opt;
307 unsigned int s_atime_quantum; 314 unsigned int s_atime_quantum;
@@ -756,35 +763,18 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
756 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); 763 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
757} 764}
758 765
759static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) 766static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
760{ 767{
761 spin_lock(&osb->osb_lock); 768 ext2_set_bit(bit, bitmap);
762 osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
763 spin_unlock(&osb->osb_lock);
764 atomic_set(&osb->s_num_inodes_stolen, 0);
765} 769}
770#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
766 771
767static inline void ocfs2_set_inode_steal_slot(struct ocfs2_super *osb, 772static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
768 s16 slot)
769{ 773{
770 spin_lock(&osb->osb_lock); 774 ext2_clear_bit(bit, bitmap);
771 osb->s_inode_steal_slot = slot;
772 spin_unlock(&osb->osb_lock);
773}
774
775static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
776{
777 s16 slot;
778
779 spin_lock(&osb->osb_lock);
780 slot = osb->s_inode_steal_slot;
781 spin_unlock(&osb->osb_lock);
782
783 return slot;
784} 775}
776#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
785 777
786#define ocfs2_set_bit ext2_set_bit
787#define ocfs2_clear_bit ext2_clear_bit
788#define ocfs2_test_bit ext2_test_bit 778#define ocfs2_test_bit ext2_test_bit
789#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit 779#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
790#define ocfs2_find_next_bit ext2_find_next_bit 780#define ocfs2_find_next_bit ext2_find_next_bit
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 1a1a679e51b5..bb37218a7978 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -254,63 +254,6 @@
254 * refcount tree */ 254 * refcount tree */
255 255
256/* 256/*
257 * ioctl commands
258 */
259#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long)
260#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long)
261#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int)
262#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
263
264/*
265 * Space reservation / allocation / free ioctls and argument structure
266 * are designed to be compatible with XFS.
267 *
268 * ALLOCSP* and FREESP* are not and will never be supported, but are
269 * included here for completeness.
270 */
271struct ocfs2_space_resv {
272 __s16 l_type;
273 __s16 l_whence;
274 __s64 l_start;
275 __s64 l_len; /* len == 0 means until end of file */
276 __s32 l_sysid;
277 __u32 l_pid;
278 __s32 l_pad[4]; /* reserve area */
279};
280
281#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
282#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
283#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
284#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
285#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
286#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
287#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
288#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
289
290/* Used to pass group descriptor data when online resize is done */
291struct ocfs2_new_group_input {
292 __u64 group; /* Group descriptor's blkno. */
293 __u32 clusters; /* Total number of clusters in this group */
294 __u32 frees; /* Total free clusters in this group */
295 __u16 chain; /* Chain for this group */
296 __u16 reserved1;
297 __u32 reserved2;
298};
299
300#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
301#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
302#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
303
304/* Used to pass 2 file names to reflink. */
305struct reflink_arguments {
306 __u64 old_path;
307 __u64 new_path;
308 __u64 preserve;
309};
310#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
311
312
313/*
314 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) 257 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
315 */ 258 */
316#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ 259#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */
@@ -1417,9 +1360,16 @@ static inline int ocfs2_fast_symlink_chars(int blocksize)
1417 return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink); 1360 return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink);
1418} 1361}
1419 1362
1420static inline int ocfs2_max_inline_data(int blocksize) 1363static inline int ocfs2_max_inline_data_with_xattr(int blocksize,
1364 struct ocfs2_dinode *di)
1421{ 1365{
1422 return blocksize - offsetof(struct ocfs2_dinode, id2.i_data.id_data); 1366 if (di && (di->i_dyn_features & OCFS2_INLINE_XATTR_FL))
1367 return blocksize -
1368 offsetof(struct ocfs2_dinode, id2.i_data.id_data) -
1369 di->i_xattr_inline_size;
1370 else
1371 return blocksize -
1372 offsetof(struct ocfs2_dinode, id2.i_data.id_data);
1423} 1373}
1424 1374
1425static inline int ocfs2_extent_recs_per_inode(int blocksize) 1375static inline int ocfs2_extent_recs_per_inode(int blocksize)
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
new file mode 100644
index 000000000000..2d3420af1a83
--- /dev/null
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -0,0 +1,79 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_ioctl.h
5 *
6 * Defines OCFS2 ioctls.
7 *
8 * Copyright (C) 2010 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_IOCTL_H
21#define OCFS2_IOCTL_H
22
23/*
24 * ioctl commands
25 */
26#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long)
27#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long)
28#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int)
29#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
30
31/*
32 * Space reservation / allocation / free ioctls and argument structure
33 * are designed to be compatible with XFS.
34 *
35 * ALLOCSP* and FREESP* are not and will never be supported, but are
36 * included here for completeness.
37 */
38struct ocfs2_space_resv {
39 __s16 l_type;
40 __s16 l_whence;
41 __s64 l_start;
42 __s64 l_len; /* len == 0 means until end of file */
43 __s32 l_sysid;
44 __u32 l_pid;
45 __s32 l_pad[4]; /* reserve area */
46};
47
48#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
49#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
50#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
51#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
52#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
53#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
54#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
55#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
56
57/* Used to pass group descriptor data when online resize is done */
58struct ocfs2_new_group_input {
59 __u64 group; /* Group descriptor's blkno. */
60 __u32 clusters; /* Total number of clusters in this group */
61 __u32 frees; /* Total free clusters in this group */
62 __u16 chain; /* Chain for this group */
63 __u16 reserved1;
64 __u32 reserved2;
65};
66
67#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
68#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
69#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
70
71/* Used to pass 2 file names to reflink. */
72struct reflink_arguments {
73 __u64 old_path;
74 __u64 new_path;
75 __u64 preserve;
76};
77#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
78
79#endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/ocfs2_lockingver.h b/fs/ocfs2/ocfs2_lockingver.h
index 82d5eeac0fff..2e45c8d2ea7e 100644
--- a/fs/ocfs2/ocfs2_lockingver.h
+++ b/fs/ocfs2/ocfs2_lockingver.h
@@ -23,6 +23,8 @@
23/* 23/*
24 * The protocol version for ocfs2 cluster locking. See dlmglue.c for 24 * The protocol version for ocfs2 cluster locking. See dlmglue.c for
25 * more details. 25 * more details.
26 *
27 * 1.0 - Initial locking version from ocfs2 1.4.
26 */ 28 */
27#define OCFS2_LOCKING_PROTOCOL_MAJOR 1 29#define OCFS2_LOCKING_PROTOCOL_MAJOR 1
28#define OCFS2_LOCKING_PROTOCOL_MINOR 0 30#define OCFS2_LOCKING_PROTOCOL_MINOR 0
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index b437dc0c4cad..ab42a74c7539 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -3,6 +3,7 @@
3 */ 3 */
4#include <linux/spinlock.h> 4#include <linux/spinlock.h>
5#include <linux/fs.h> 5#include <linux/fs.h>
6#include <linux/slab.h>
6#include <linux/quota.h> 7#include <linux/quota.h>
7#include <linux/quotaops.h> 8#include <linux/quotaops.h>
8#include <linux/dqblk_qtree.h> 9#include <linux/dqblk_qtree.h>
@@ -851,13 +852,6 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
851} 852}
852 853
853const struct dquot_operations ocfs2_quota_operations = { 854const struct dquot_operations ocfs2_quota_operations = {
854 .initialize = dquot_initialize,
855 .drop = dquot_drop,
856 .alloc_space = dquot_alloc_space,
857 .alloc_inode = dquot_alloc_inode,
858 .free_space = dquot_free_space,
859 .free_inode = dquot_free_inode,
860 .transfer = dquot_transfer,
861 .write_dquot = ocfs2_write_dquot, 855 .write_dquot = ocfs2_write_dquot,
862 .acquire_dquot = ocfs2_acquire_dquot, 856 .acquire_dquot = ocfs2_acquire_dquot,
863 .release_dquot = ocfs2_release_dquot, 857 .release_dquot = ocfs2_release_dquot,
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 21f9e71223ca..9ad49305f450 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -3,6 +3,7 @@
3 */ 3 */
4 4
5#include <linux/fs.h> 5#include <linux/fs.h>
6#include <linux/slab.h>
6#include <linux/quota.h> 7#include <linux/quota.h>
7#include <linux/quotaops.h> 8#include <linux/quotaops.h>
8#include <linux/module.h> 9#include <linux/module.h>
@@ -457,7 +458,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
457 break; 458 break;
458 } 459 }
459 dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data; 460 dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
460 for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) { 461 for_each_set_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
461 qbh = NULL; 462 qbh = NULL;
462 status = ocfs2_read_quota_block(lqinode, 463 status = ocfs2_read_quota_block(lqinode,
463 ol_dqblk_block(sb, chunk, bit), 464 ol_dqblk_block(sb, chunk, bit),
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 74db2be75dd6..bd96f6c7877e 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -37,7 +37,6 @@
37 37
38#include <linux/bio.h> 38#include <linux/bio.h>
39#include <linux/blkdev.h> 39#include <linux/blkdev.h>
40#include <linux/gfp.h>
41#include <linux/slab.h> 40#include <linux/slab.h>
42#include <linux/writeback.h> 41#include <linux/writeback.h>
43#include <linux/pagevec.h> 42#include <linux/pagevec.h>
@@ -626,7 +625,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
626 rb = (struct ocfs2_refcount_block *)new_bh->b_data; 625 rb = (struct ocfs2_refcount_block *)new_bh->b_data;
627 memset(rb, 0, inode->i_sb->s_blocksize); 626 memset(rb, 0, inode->i_sb->s_blocksize);
628 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 627 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
629 rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num); 628 rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
630 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 629 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
631 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); 630 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
632 rb->rf_blkno = cpu_to_le64(first_blkno); 631 rb->rf_blkno = cpu_to_le64(first_blkno);
@@ -1330,7 +1329,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1330 memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize); 1329 memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
1331 1330
1332 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1331 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1333 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); 1332 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1334 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1333 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1335 new_rb->rf_blkno = cpu_to_le64(blkno); 1334 new_rb->rf_blkno = cpu_to_le64(blkno);
1336 new_rb->rf_cpos = cpu_to_le32(0); 1335 new_rb->rf_cpos = cpu_to_le32(0);
@@ -1576,7 +1575,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1576 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1575 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1577 memset(new_rb, 0, sb->s_blocksize); 1576 memset(new_rb, 0, sb->s_blocksize);
1578 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 1577 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1579 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); 1578 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1580 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1579 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1581 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 1580 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1582 new_rb->rf_blkno = cpu_to_le64(blkno); 1581 new_rb->rf_blkno = cpu_to_le64(blkno);
@@ -2945,7 +2944,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2945 2944
2946 while (offset < end) { 2945 while (offset < end) {
2947 page_index = offset >> PAGE_CACHE_SHIFT; 2946 page_index = offset >> PAGE_CACHE_SHIFT;
2948 map_end = (page_index + 1) << PAGE_CACHE_SHIFT; 2947 map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
2949 if (map_end > end) 2948 if (map_end > end)
2950 map_end = end; 2949 map_end = end;
2951 2950
@@ -2957,8 +2956,12 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2957 2956
2958 page = grab_cache_page(mapping, page_index); 2957 page = grab_cache_page(mapping, page_index);
2959 2958
2960 /* This page can't be dirtied before we CoW it out. */ 2959 /*
2960 BUG_ON(PageDirty(page)); 2960 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, this page
2961 * can't be dirtied before we CoW it out.
2962 */
2963 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
2964 BUG_ON(PageDirty(page));
2962 2965
2963 if (!PageUptodate(page)) { 2966 if (!PageUptodate(page)) {
2964 ret = block_read_full_page(page, ocfs2_get_block); 2967 ret = block_read_full_page(page, ocfs2_get_block);
@@ -3170,7 +3173,7 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
3170 3173
3171 while (offset < end) { 3174 while (offset < end) {
3172 page_index = offset >> PAGE_CACHE_SHIFT; 3175 page_index = offset >> PAGE_CACHE_SHIFT;
3173 map_end = (page_index + 1) << PAGE_CACHE_SHIFT; 3176 map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
3174 if (map_end > end) 3177 if (map_end > end)
3175 map_end = end; 3178 map_end = end;
3176 3179
@@ -4071,6 +4074,7 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
4071 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features; 4074 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
4072 spin_unlock(&OCFS2_I(t_inode)->ip_lock); 4075 spin_unlock(&OCFS2_I(t_inode)->ip_lock);
4073 i_size_write(t_inode, size); 4076 i_size_write(t_inode, size);
4077 t_inode->i_blocks = s_inode->i_blocks;
4074 4078
4075 di->i_xattr_inline_size = s_di->i_xattr_inline_size; 4079 di->i_xattr_inline_size = s_di->i_xattr_inline_size;
4076 di->i_clusters = s_di->i_clusters; 4080 di->i_clusters = s_di->i_clusters;
@@ -4386,7 +4390,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
4386 } 4390 }
4387 4391
4388 mutex_lock(&inode->i_mutex); 4392 mutex_lock(&inode->i_mutex);
4389 vfs_dq_init(dir); 4393 dquot_initialize(dir);
4390 error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); 4394 error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
4391 mutex_unlock(&inode->i_mutex); 4395 mutex_unlock(&inode->i_mutex);
4392 if (!error) 4396 if (!error)
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index e49c41050264..0d3049f696c5 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -19,6 +19,7 @@
19 19
20#include <linux/kernel.h> 20#include <linux/kernel.h>
21#include <linux/crc32.h> 21#include <linux/crc32.h>
22#include <linux/slab.h>
22#include <linux/module.h> 23#include <linux/module.h>
23 24
24/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */ 25/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
@@ -161,24 +162,23 @@ static int dlm_status_to_errno(enum dlm_status status)
161 162
162static void o2dlm_lock_ast_wrapper(void *astarg) 163static void o2dlm_lock_ast_wrapper(void *astarg)
163{ 164{
164 BUG_ON(o2cb_stack.sp_proto == NULL); 165 struct ocfs2_dlm_lksb *lksb = astarg;
165 166
166 o2cb_stack.sp_proto->lp_lock_ast(astarg); 167 lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
167} 168}
168 169
169static void o2dlm_blocking_ast_wrapper(void *astarg, int level) 170static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
170{ 171{
171 BUG_ON(o2cb_stack.sp_proto == NULL); 172 struct ocfs2_dlm_lksb *lksb = astarg;
172 173
173 o2cb_stack.sp_proto->lp_blocking_ast(astarg, level); 174 lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
174} 175}
175 176
176static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status) 177static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
177{ 178{
179 struct ocfs2_dlm_lksb *lksb = astarg;
178 int error = dlm_status_to_errno(status); 180 int error = dlm_status_to_errno(status);
179 181
180 BUG_ON(o2cb_stack.sp_proto == NULL);
181
182 /* 182 /*
183 * In o2dlm, you can get both the lock_ast() for the lock being 183 * In o2dlm, you can get both the lock_ast() for the lock being
184 * granted and the unlock_ast() for the CANCEL failing. A 184 * granted and the unlock_ast() for the CANCEL failing. A
@@ -193,16 +193,15 @@ static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
193 if (status == DLM_CANCELGRANT) 193 if (status == DLM_CANCELGRANT)
194 return; 194 return;
195 195
196 o2cb_stack.sp_proto->lp_unlock_ast(astarg, error); 196 lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, error);
197} 197}
198 198
199static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn, 199static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
200 int mode, 200 int mode,
201 union ocfs2_dlm_lksb *lksb, 201 struct ocfs2_dlm_lksb *lksb,
202 u32 flags, 202 u32 flags,
203 void *name, 203 void *name,
204 unsigned int namelen, 204 unsigned int namelen)
205 void *astarg)
206{ 205{
207 enum dlm_status status; 206 enum dlm_status status;
208 int o2dlm_mode = mode_to_o2dlm(mode); 207 int o2dlm_mode = mode_to_o2dlm(mode);
@@ -211,28 +210,27 @@ static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
211 210
212 status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm, 211 status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm,
213 o2dlm_flags, name, namelen, 212 o2dlm_flags, name, namelen,
214 o2dlm_lock_ast_wrapper, astarg, 213 o2dlm_lock_ast_wrapper, lksb,
215 o2dlm_blocking_ast_wrapper); 214 o2dlm_blocking_ast_wrapper);
216 ret = dlm_status_to_errno(status); 215 ret = dlm_status_to_errno(status);
217 return ret; 216 return ret;
218} 217}
219 218
220static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn, 219static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
221 union ocfs2_dlm_lksb *lksb, 220 struct ocfs2_dlm_lksb *lksb,
222 u32 flags, 221 u32 flags)
223 void *astarg)
224{ 222{
225 enum dlm_status status; 223 enum dlm_status status;
226 int o2dlm_flags = flags_to_o2dlm(flags); 224 int o2dlm_flags = flags_to_o2dlm(flags);
227 int ret; 225 int ret;
228 226
229 status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm, 227 status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm,
230 o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg); 228 o2dlm_flags, o2dlm_unlock_ast_wrapper, lksb);
231 ret = dlm_status_to_errno(status); 229 ret = dlm_status_to_errno(status);
232 return ret; 230 return ret;
233} 231}
234 232
235static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb) 233static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
236{ 234{
237 return dlm_status_to_errno(lksb->lksb_o2dlm.status); 235 return dlm_status_to_errno(lksb->lksb_o2dlm.status);
238} 236}
@@ -242,17 +240,17 @@ static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
242 * contents, it will zero out the LVB. Thus the caller can always trust 240 * contents, it will zero out the LVB. Thus the caller can always trust
243 * the contents. 241 * the contents.
244 */ 242 */
245static int o2cb_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) 243static int o2cb_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
246{ 244{
247 return 1; 245 return 1;
248} 246}
249 247
250static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb) 248static void *o2cb_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
251{ 249{
252 return (void *)(lksb->lksb_o2dlm.lvb); 250 return (void *)(lksb->lksb_o2dlm.lvb);
253} 251}
254 252
255static void o2cb_dump_lksb(union ocfs2_dlm_lksb *lksb) 253static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
256{ 254{
257 dlm_print_one_lock(lksb->lksb_o2dlm.lockid); 255 dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
258} 256}
@@ -277,10 +275,10 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
277 u32 dlm_key; 275 u32 dlm_key;
278 struct dlm_ctxt *dlm; 276 struct dlm_ctxt *dlm;
279 struct o2dlm_private *priv; 277 struct o2dlm_private *priv;
280 struct dlm_protocol_version dlm_version; 278 struct dlm_protocol_version fs_version;
281 279
282 BUG_ON(conn == NULL); 280 BUG_ON(conn == NULL);
283 BUG_ON(o2cb_stack.sp_proto == NULL); 281 BUG_ON(conn->cc_proto == NULL);
284 282
285 /* for now we only have one cluster/node, make sure we see it 283 /* for now we only have one cluster/node, make sure we see it
286 * in the heartbeat universe */ 284 * in the heartbeat universe */
@@ -304,18 +302,18 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
304 /* used by the dlm code to make message headers unique, each 302 /* used by the dlm code to make message headers unique, each
305 * node in this domain must agree on this. */ 303 * node in this domain must agree on this. */
306 dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen); 304 dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen);
307 dlm_version.pv_major = conn->cc_version.pv_major; 305 fs_version.pv_major = conn->cc_version.pv_major;
308 dlm_version.pv_minor = conn->cc_version.pv_minor; 306 fs_version.pv_minor = conn->cc_version.pv_minor;
309 307
310 dlm = dlm_register_domain(conn->cc_name, dlm_key, &dlm_version); 308 dlm = dlm_register_domain(conn->cc_name, dlm_key, &fs_version);
311 if (IS_ERR(dlm)) { 309 if (IS_ERR(dlm)) {
312 rc = PTR_ERR(dlm); 310 rc = PTR_ERR(dlm);
313 mlog_errno(rc); 311 mlog_errno(rc);
314 goto out_free; 312 goto out_free;
315 } 313 }
316 314
317 conn->cc_version.pv_major = dlm_version.pv_major; 315 conn->cc_version.pv_major = fs_version.pv_major;
318 conn->cc_version.pv_minor = dlm_version.pv_minor; 316 conn->cc_version.pv_minor = fs_version.pv_minor;
319 conn->cc_lockspace = dlm; 317 conn->cc_lockspace = dlm;
320 318
321 dlm_register_eviction_cb(dlm, &priv->op_eviction_cb); 319 dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index da78a2a334fd..2dc57bca0688 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,11 +21,11 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/miscdevice.h> 22#include <linux/miscdevice.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h>
24#include <linux/smp_lock.h> 25#include <linux/smp_lock.h>
25#include <linux/reboot.h> 26#include <linux/reboot.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
27 28
28#include "ocfs2.h" /* For struct ocfs2_lock_res */
29#include "stackglue.h" 29#include "stackglue.h"
30 30
31#include <linux/dlm_plock.h> 31#include <linux/dlm_plock.h>
@@ -63,8 +63,8 @@
63 * negotiated by the client. The client negotiates based on the maximum 63 * negotiated by the client. The client negotiates based on the maximum
64 * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major 64 * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major
65 * number from the "SETV" message must match 65 * number from the "SETV" message must match
66 * ocfs2_user_plugin.sp_proto->lp_max_version.pv_major, and the minor number 66 * ocfs2_user_plugin.sp_max_proto.pv_major, and the minor number
67 * must be less than or equal to ...->lp_max_version.pv_minor. 67 * must be less than or equal to ...sp_max_proto.pv_minor.
68 * 68 *
69 * Once this information has been set, mounts will be allowed. From this 69 * Once this information has been set, mounts will be allowed. From this
70 * point on, the "DOWN" message can be sent for node down notification. 70 * point on, the "DOWN" message can be sent for node down notification.
@@ -401,7 +401,7 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
401 char *ptr = NULL; 401 char *ptr = NULL;
402 struct ocfs2_control_private *p = file->private_data; 402 struct ocfs2_control_private *p = file->private_data;
403 struct ocfs2_protocol_version *max = 403 struct ocfs2_protocol_version *max =
404 &ocfs2_user_plugin.sp_proto->lp_max_version; 404 &ocfs2_user_plugin.sp_max_proto;
405 405
406 if (ocfs2_control_get_handshake_state(file) != 406 if (ocfs2_control_get_handshake_state(file) !=
407 OCFS2_CONTROL_HANDSHAKE_PROTOCOL) 407 OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
@@ -664,18 +664,10 @@ static void ocfs2_control_exit(void)
664 -rc); 664 -rc);
665} 665}
666 666
667static struct dlm_lksb *fsdlm_astarg_to_lksb(void *astarg)
668{
669 struct ocfs2_lock_res *res = astarg;
670 return &res->l_lksb.lksb_fsdlm;
671}
672
673static void fsdlm_lock_ast_wrapper(void *astarg) 667static void fsdlm_lock_ast_wrapper(void *astarg)
674{ 668{
675 struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg); 669 struct ocfs2_dlm_lksb *lksb = astarg;
676 int status = lksb->sb_status; 670 int status = lksb->lksb_fsdlm.sb_status;
677
678 BUG_ON(ocfs2_user_plugin.sp_proto == NULL);
679 671
680 /* 672 /*
681 * For now we're punting on the issue of other non-standard errors 673 * For now we're punting on the issue of other non-standard errors
@@ -688,25 +680,24 @@ static void fsdlm_lock_ast_wrapper(void *astarg)
688 */ 680 */
689 681
690 if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL) 682 if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL)
691 ocfs2_user_plugin.sp_proto->lp_unlock_ast(astarg, 0); 683 lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, 0);
692 else 684 else
693 ocfs2_user_plugin.sp_proto->lp_lock_ast(astarg); 685 lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
694} 686}
695 687
696static void fsdlm_blocking_ast_wrapper(void *astarg, int level) 688static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
697{ 689{
698 BUG_ON(ocfs2_user_plugin.sp_proto == NULL); 690 struct ocfs2_dlm_lksb *lksb = astarg;
699 691
700 ocfs2_user_plugin.sp_proto->lp_blocking_ast(astarg, level); 692 lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
701} 693}
702 694
703static int user_dlm_lock(struct ocfs2_cluster_connection *conn, 695static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
704 int mode, 696 int mode,
705 union ocfs2_dlm_lksb *lksb, 697 struct ocfs2_dlm_lksb *lksb,
706 u32 flags, 698 u32 flags,
707 void *name, 699 void *name,
708 unsigned int namelen, 700 unsigned int namelen)
709 void *astarg)
710{ 701{
711 int ret; 702 int ret;
712 703
@@ -716,36 +707,35 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
716 707
717 ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, 708 ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
718 flags|DLM_LKF_NODLCKWT, name, namelen, 0, 709 flags|DLM_LKF_NODLCKWT, name, namelen, 0,
719 fsdlm_lock_ast_wrapper, astarg, 710 fsdlm_lock_ast_wrapper, lksb,
720 fsdlm_blocking_ast_wrapper); 711 fsdlm_blocking_ast_wrapper);
721 return ret; 712 return ret;
722} 713}
723 714
724static int user_dlm_unlock(struct ocfs2_cluster_connection *conn, 715static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
725 union ocfs2_dlm_lksb *lksb, 716 struct ocfs2_dlm_lksb *lksb,
726 u32 flags, 717 u32 flags)
727 void *astarg)
728{ 718{
729 int ret; 719 int ret;
730 720
731 ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, 721 ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
732 flags, &lksb->lksb_fsdlm, astarg); 722 flags, &lksb->lksb_fsdlm, lksb);
733 return ret; 723 return ret;
734} 724}
735 725
736static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb) 726static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
737{ 727{
738 return lksb->lksb_fsdlm.sb_status; 728 return lksb->lksb_fsdlm.sb_status;
739} 729}
740 730
741static int user_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) 731static int user_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
742{ 732{
743 int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID; 733 int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID;
744 734
745 return !invalid; 735 return !invalid;
746} 736}
747 737
748static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb) 738static void *user_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
749{ 739{
750 if (!lksb->lksb_fsdlm.sb_lvbptr) 740 if (!lksb->lksb_fsdlm.sb_lvbptr)
751 lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb + 741 lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
@@ -753,7 +743,7 @@ static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
753 return (void *)(lksb->lksb_fsdlm.sb_lvbptr); 743 return (void *)(lksb->lksb_fsdlm.sb_lvbptr);
754} 744}
755 745
756static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) 746static void user_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
757{ 747{
758} 748}
759 749
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index f3df0baa9a48..39abf89697ed 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -36,7 +36,7 @@
36#define OCFS2_STACK_PLUGIN_USER "user" 36#define OCFS2_STACK_PLUGIN_USER "user"
37#define OCFS2_MAX_HB_CTL_PATH 256 37#define OCFS2_MAX_HB_CTL_PATH 256
38 38
39static struct ocfs2_locking_protocol *lproto; 39static struct ocfs2_protocol_version locking_max_version;
40static DEFINE_SPINLOCK(ocfs2_stack_lock); 40static DEFINE_SPINLOCK(ocfs2_stack_lock);
41static LIST_HEAD(ocfs2_stack_list); 41static LIST_HEAD(ocfs2_stack_list);
42static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1]; 42static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1];
@@ -176,7 +176,7 @@ int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin)
176 spin_lock(&ocfs2_stack_lock); 176 spin_lock(&ocfs2_stack_lock);
177 if (!ocfs2_stack_lookup(plugin->sp_name)) { 177 if (!ocfs2_stack_lookup(plugin->sp_name)) {
178 plugin->sp_count = 0; 178 plugin->sp_count = 0;
179 plugin->sp_proto = lproto; 179 plugin->sp_max_proto = locking_max_version;
180 list_add(&plugin->sp_list, &ocfs2_stack_list); 180 list_add(&plugin->sp_list, &ocfs2_stack_list);
181 printk(KERN_INFO "ocfs2: Registered cluster interface %s\n", 181 printk(KERN_INFO "ocfs2: Registered cluster interface %s\n",
182 plugin->sp_name); 182 plugin->sp_name);
@@ -213,77 +213,76 @@ void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin)
213} 213}
214EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister); 214EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister);
215 215
216void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto) 216void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto)
217{ 217{
218 struct ocfs2_stack_plugin *p; 218 struct ocfs2_stack_plugin *p;
219 219
220 BUG_ON(proto == NULL);
221
222 spin_lock(&ocfs2_stack_lock); 220 spin_lock(&ocfs2_stack_lock);
223 BUG_ON(active_stack != NULL); 221 if (memcmp(max_proto, &locking_max_version,
222 sizeof(struct ocfs2_protocol_version))) {
223 BUG_ON(locking_max_version.pv_major != 0);
224 224
225 lproto = proto; 225 locking_max_version = *max_proto;
226 list_for_each_entry(p, &ocfs2_stack_list, sp_list) { 226 list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
227 p->sp_proto = lproto; 227 p->sp_max_proto = locking_max_version;
228 }
228 } 229 }
229
230 spin_unlock(&ocfs2_stack_lock); 230 spin_unlock(&ocfs2_stack_lock);
231} 231}
232EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol); 232EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_max_proto_version);
233 233
234 234
235/* 235/*
236 * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take 236 * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take no argument
237 * "struct ocfs2_lock_res *astarg" instead of "void *astarg" because the 237 * for the ast and bast functions. They will pass the lksb to the ast
238 * underlying stack plugins need to pilfer the lksb off of the lock_res. 238 * and bast. The caller can wrap the lksb with their own structure to
239 * If some other structure needs to be passed as an astarg, the plugins 239 * get more information.
240 * will need to be given a different avenue to the lksb.
241 */ 240 */
242int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, 241int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
243 int mode, 242 int mode,
244 union ocfs2_dlm_lksb *lksb, 243 struct ocfs2_dlm_lksb *lksb,
245 u32 flags, 244 u32 flags,
246 void *name, 245 void *name,
247 unsigned int namelen, 246 unsigned int namelen)
248 struct ocfs2_lock_res *astarg)
249{ 247{
250 BUG_ON(lproto == NULL); 248 if (!lksb->lksb_conn)
251 249 lksb->lksb_conn = conn;
250 else
251 BUG_ON(lksb->lksb_conn != conn);
252 return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags, 252 return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags,
253 name, namelen, astarg); 253 name, namelen);
254} 254}
255EXPORT_SYMBOL_GPL(ocfs2_dlm_lock); 255EXPORT_SYMBOL_GPL(ocfs2_dlm_lock);
256 256
257int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, 257int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
258 union ocfs2_dlm_lksb *lksb, 258 struct ocfs2_dlm_lksb *lksb,
259 u32 flags, 259 u32 flags)
260 struct ocfs2_lock_res *astarg)
261{ 260{
262 BUG_ON(lproto == NULL); 261 BUG_ON(lksb->lksb_conn == NULL);
263 262
264 return active_stack->sp_ops->dlm_unlock(conn, lksb, flags, astarg); 263 return active_stack->sp_ops->dlm_unlock(conn, lksb, flags);
265} 264}
266EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock); 265EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock);
267 266
268int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb) 267int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
269{ 268{
270 return active_stack->sp_ops->lock_status(lksb); 269 return active_stack->sp_ops->lock_status(lksb);
271} 270}
272EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status); 271EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
273 272
274int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb) 273int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
275{ 274{
276 return active_stack->sp_ops->lvb_valid(lksb); 275 return active_stack->sp_ops->lvb_valid(lksb);
277} 276}
278EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid); 277EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid);
279 278
280void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb) 279void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
281{ 280{
282 return active_stack->sp_ops->lock_lvb(lksb); 281 return active_stack->sp_ops->lock_lvb(lksb);
283} 282}
284EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb); 283EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb);
285 284
286void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) 285void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
287{ 286{
288 active_stack->sp_ops->dump_lksb(lksb); 287 active_stack->sp_ops->dump_lksb(lksb);
289} 288}
@@ -312,6 +311,7 @@ EXPORT_SYMBOL_GPL(ocfs2_plock);
312int ocfs2_cluster_connect(const char *stack_name, 311int ocfs2_cluster_connect(const char *stack_name,
313 const char *group, 312 const char *group,
314 int grouplen, 313 int grouplen,
314 struct ocfs2_locking_protocol *lproto,
315 void (*recovery_handler)(int node_num, 315 void (*recovery_handler)(int node_num,
316 void *recovery_data), 316 void *recovery_data),
317 void *recovery_data, 317 void *recovery_data,
@@ -329,6 +329,12 @@ int ocfs2_cluster_connect(const char *stack_name,
329 goto out; 329 goto out;
330 } 330 }
331 331
332 if (memcmp(&lproto->lp_max_version, &locking_max_version,
333 sizeof(struct ocfs2_protocol_version))) {
334 rc = -EINVAL;
335 goto out;
336 }
337
332 new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection), 338 new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection),
333 GFP_KERNEL); 339 GFP_KERNEL);
334 if (!new_conn) { 340 if (!new_conn) {
@@ -341,6 +347,7 @@ int ocfs2_cluster_connect(const char *stack_name,
341 new_conn->cc_recovery_handler = recovery_handler; 347 new_conn->cc_recovery_handler = recovery_handler;
342 new_conn->cc_recovery_data = recovery_data; 348 new_conn->cc_recovery_data = recovery_data;
343 349
350 new_conn->cc_proto = lproto;
344 /* Start the new connection at our maximum compatibility level */ 351 /* Start the new connection at our maximum compatibility level */
345 new_conn->cc_version = lproto->lp_max_version; 352 new_conn->cc_version = lproto->lp_max_version;
346 353
@@ -366,6 +373,24 @@ out:
366} 373}
367EXPORT_SYMBOL_GPL(ocfs2_cluster_connect); 374EXPORT_SYMBOL_GPL(ocfs2_cluster_connect);
368 375
376/* The caller will ensure all nodes have the same cluster stack */
377int ocfs2_cluster_connect_agnostic(const char *group,
378 int grouplen,
379 struct ocfs2_locking_protocol *lproto,
380 void (*recovery_handler)(int node_num,
381 void *recovery_data),
382 void *recovery_data,
383 struct ocfs2_cluster_connection **conn)
384{
385 char *stack_name = NULL;
386
387 if (cluster_stack_name[0])
388 stack_name = cluster_stack_name;
389 return ocfs2_cluster_connect(stack_name, group, grouplen, lproto,
390 recovery_handler, recovery_data, conn);
391}
392EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic);
393
369/* If hangup_pending is 0, the stack driver will be dropped */ 394/* If hangup_pending is 0, the stack driver will be dropped */
370int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, 395int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
371 int hangup_pending) 396 int hangup_pending)
@@ -453,10 +478,10 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
453 ssize_t ret = 0; 478 ssize_t ret = 0;
454 479
455 spin_lock(&ocfs2_stack_lock); 480 spin_lock(&ocfs2_stack_lock);
456 if (lproto) 481 if (locking_max_version.pv_major)
457 ret = snprintf(buf, PAGE_SIZE, "%u.%u\n", 482 ret = snprintf(buf, PAGE_SIZE, "%u.%u\n",
458 lproto->lp_max_version.pv_major, 483 locking_max_version.pv_major,
459 lproto->lp_max_version.pv_minor); 484 locking_max_version.pv_minor);
460 spin_unlock(&ocfs2_stack_lock); 485 spin_unlock(&ocfs2_stack_lock);
461 486
462 return ret; 487 return ret;
@@ -685,7 +710,10 @@ static int __init ocfs2_stack_glue_init(void)
685 710
686static void __exit ocfs2_stack_glue_exit(void) 711static void __exit ocfs2_stack_glue_exit(void)
687{ 712{
688 lproto = NULL; 713 memset(&locking_max_version, 0,
714 sizeof(struct ocfs2_protocol_version));
715 locking_max_version.pv_major = 0;
716 locking_max_version.pv_minor = 0;
689 ocfs2_sysfs_exit(); 717 ocfs2_sysfs_exit();
690 if (ocfs2_table_header) 718 if (ocfs2_table_header)
691 unregister_sysctl_table(ocfs2_table_header); 719 unregister_sysctl_table(ocfs2_table_header);
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 03a44d60eac9..8ce7398ae1d2 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -56,17 +56,6 @@ struct ocfs2_protocol_version {
56}; 56};
57 57
58/* 58/*
59 * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
60 */
61struct ocfs2_locking_protocol {
62 struct ocfs2_protocol_version lp_max_version;
63 void (*lp_lock_ast)(void *astarg);
64 void (*lp_blocking_ast)(void *astarg, int level);
65 void (*lp_unlock_ast)(void *astarg, int error);
66};
67
68
69/*
70 * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only 59 * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only
71 * has a pointer to separately allocated lvb space. This struct exists only to 60 * has a pointer to separately allocated lvb space. This struct exists only to
72 * include in the lksb union to make space for a combined dlm_lksb and lvb. 61 * include in the lksb union to make space for a combined dlm_lksb and lvb.
@@ -81,12 +70,27 @@ struct fsdlm_lksb_plus_lvb {
81 * size of the union is known. Lock status structures are embedded in 70 * size of the union is known. Lock status structures are embedded in
82 * ocfs2 inodes. 71 * ocfs2 inodes.
83 */ 72 */
84union ocfs2_dlm_lksb { 73struct ocfs2_cluster_connection;
85 struct dlm_lockstatus lksb_o2dlm; 74struct ocfs2_dlm_lksb {
86 struct dlm_lksb lksb_fsdlm; 75 union {
87 struct fsdlm_lksb_plus_lvb padding; 76 struct dlm_lockstatus lksb_o2dlm;
77 struct dlm_lksb lksb_fsdlm;
78 struct fsdlm_lksb_plus_lvb padding;
79 };
80 struct ocfs2_cluster_connection *lksb_conn;
81};
82
83/*
84 * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
85 */
86struct ocfs2_locking_protocol {
87 struct ocfs2_protocol_version lp_max_version;
88 void (*lp_lock_ast)(struct ocfs2_dlm_lksb *lksb);
89 void (*lp_blocking_ast)(struct ocfs2_dlm_lksb *lksb, int level);
90 void (*lp_unlock_ast)(struct ocfs2_dlm_lksb *lksb, int error);
88}; 91};
89 92
93
90/* 94/*
91 * A cluster connection. Mostly opaque to ocfs2, the connection holds 95 * A cluster connection. Mostly opaque to ocfs2, the connection holds
92 * state for the underlying stack. ocfs2 does use cc_version to determine 96 * state for the underlying stack. ocfs2 does use cc_version to determine
@@ -96,6 +100,7 @@ struct ocfs2_cluster_connection {
96 char cc_name[GROUP_NAME_MAX]; 100 char cc_name[GROUP_NAME_MAX];
97 int cc_namelen; 101 int cc_namelen;
98 struct ocfs2_protocol_version cc_version; 102 struct ocfs2_protocol_version cc_version;
103 struct ocfs2_locking_protocol *cc_proto;
99 void (*cc_recovery_handler)(int node_num, void *recovery_data); 104 void (*cc_recovery_handler)(int node_num, void *recovery_data);
100 void *cc_recovery_data; 105 void *cc_recovery_data;
101 void *cc_lockspace; 106 void *cc_lockspace;
@@ -155,27 +160,29 @@ struct ocfs2_stack_operations {
155 * 160 *
156 * ast and bast functions are not part of the call because the 161 * ast and bast functions are not part of the call because the
157 * stack will likely want to wrap ast and bast calls before passing 162 * stack will likely want to wrap ast and bast calls before passing
158 * them to stack->sp_proto. 163 * them to stack->sp_proto. There is no astarg. The lksb will
164 * be passed back to the ast and bast functions. The caller can
165 * use this to find their object.
159 */ 166 */
160 int (*dlm_lock)(struct ocfs2_cluster_connection *conn, 167 int (*dlm_lock)(struct ocfs2_cluster_connection *conn,
161 int mode, 168 int mode,
162 union ocfs2_dlm_lksb *lksb, 169 struct ocfs2_dlm_lksb *lksb,
163 u32 flags, 170 u32 flags,
164 void *name, 171 void *name,
165 unsigned int namelen, 172 unsigned int namelen);
166 void *astarg);
167 173
168 /* 174 /*
169 * Call the underlying dlm unlock function. The ->dlm_unlock() 175 * Call the underlying dlm unlock function. The ->dlm_unlock()
170 * function should convert the flags as appropriate. 176 * function should convert the flags as appropriate.
171 * 177 *
172 * The unlock ast is not passed, as the stack will want to wrap 178 * The unlock ast is not passed, as the stack will want to wrap
173 * it before calling stack->sp_proto->lp_unlock_ast(). 179 * it before calling stack->sp_proto->lp_unlock_ast(). There is
180 * no astarg. The lksb will be passed back to the unlock ast
181 * function. The caller can use this to find their object.
174 */ 182 */
175 int (*dlm_unlock)(struct ocfs2_cluster_connection *conn, 183 int (*dlm_unlock)(struct ocfs2_cluster_connection *conn,
176 union ocfs2_dlm_lksb *lksb, 184 struct ocfs2_dlm_lksb *lksb,
177 u32 flags, 185 u32 flags);
178 void *astarg);
179 186
180 /* 187 /*
181 * Return the status of the current lock status block. The fs 188 * Return the status of the current lock status block. The fs
@@ -183,17 +190,17 @@ struct ocfs2_stack_operations {
183 * callback pulls out the stack-specific lksb, converts the status 190 * callback pulls out the stack-specific lksb, converts the status
184 * to a proper errno, and returns it. 191 * to a proper errno, and returns it.
185 */ 192 */
186 int (*lock_status)(union ocfs2_dlm_lksb *lksb); 193 int (*lock_status)(struct ocfs2_dlm_lksb *lksb);
187 194
188 /* 195 /*
189 * Return non-zero if the LVB is valid. 196 * Return non-zero if the LVB is valid.
190 */ 197 */
191 int (*lvb_valid)(union ocfs2_dlm_lksb *lksb); 198 int (*lvb_valid)(struct ocfs2_dlm_lksb *lksb);
192 199
193 /* 200 /*
194 * Pull the lvb pointer off of the stack-specific lksb. 201 * Pull the lvb pointer off of the stack-specific lksb.
195 */ 202 */
196 void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb); 203 void *(*lock_lvb)(struct ocfs2_dlm_lksb *lksb);
197 204
198 /* 205 /*
199 * Cluster-aware posix locks 206 * Cluster-aware posix locks
@@ -210,7 +217,7 @@ struct ocfs2_stack_operations {
210 * This is an optoinal debugging hook. If provided, the 217 * This is an optoinal debugging hook. If provided, the
211 * stack can dump debugging information about this lock. 218 * stack can dump debugging information about this lock.
212 */ 219 */
213 void (*dump_lksb)(union ocfs2_dlm_lksb *lksb); 220 void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb);
214}; 221};
215 222
216/* 223/*
@@ -226,7 +233,7 @@ struct ocfs2_stack_plugin {
226 /* These are managed by the stackglue code. */ 233 /* These are managed by the stackglue code. */
227 struct list_head sp_list; 234 struct list_head sp_list;
228 unsigned int sp_count; 235 unsigned int sp_count;
229 struct ocfs2_locking_protocol *sp_proto; 236 struct ocfs2_protocol_version sp_max_proto;
230}; 237};
231 238
232 239
@@ -234,10 +241,22 @@ struct ocfs2_stack_plugin {
234int ocfs2_cluster_connect(const char *stack_name, 241int ocfs2_cluster_connect(const char *stack_name,
235 const char *group, 242 const char *group,
236 int grouplen, 243 int grouplen,
244 struct ocfs2_locking_protocol *lproto,
237 void (*recovery_handler)(int node_num, 245 void (*recovery_handler)(int node_num,
238 void *recovery_data), 246 void *recovery_data),
239 void *recovery_data, 247 void *recovery_data,
240 struct ocfs2_cluster_connection **conn); 248 struct ocfs2_cluster_connection **conn);
249/*
250 * Used by callers that don't store their stack name. They must ensure
251 * all nodes have the same stack.
252 */
253int ocfs2_cluster_connect_agnostic(const char *group,
254 int grouplen,
255 struct ocfs2_locking_protocol *lproto,
256 void (*recovery_handler)(int node_num,
257 void *recovery_data),
258 void *recovery_data,
259 struct ocfs2_cluster_connection **conn);
241int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, 260int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
242 int hangup_pending); 261 int hangup_pending);
243void ocfs2_cluster_hangup(const char *group, int grouplen); 262void ocfs2_cluster_hangup(const char *group, int grouplen);
@@ -246,26 +265,24 @@ int ocfs2_cluster_this_node(unsigned int *node);
246struct ocfs2_lock_res; 265struct ocfs2_lock_res;
247int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, 266int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
248 int mode, 267 int mode,
249 union ocfs2_dlm_lksb *lksb, 268 struct ocfs2_dlm_lksb *lksb,
250 u32 flags, 269 u32 flags,
251 void *name, 270 void *name,
252 unsigned int namelen, 271 unsigned int namelen);
253 struct ocfs2_lock_res *astarg);
254int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, 272int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
255 union ocfs2_dlm_lksb *lksb, 273 struct ocfs2_dlm_lksb *lksb,
256 u32 flags, 274 u32 flags);
257 struct ocfs2_lock_res *astarg);
258 275
259int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb); 276int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb);
260int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb); 277int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb);
261void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb); 278void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb);
262void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb); 279void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb);
263 280
264int ocfs2_stack_supports_plocks(void); 281int ocfs2_stack_supports_plocks(void);
265int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, 282int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
266 struct file *file, int cmd, struct file_lock *fl); 283 struct file *file, int cmd, struct file_lock *fl);
267 284
268void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto); 285void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto);
269 286
270 287
271/* Used by stack plugins */ 288/* Used by stack plugins */
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c30b644d9572..19ba00f28547 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -51,7 +51,7 @@
51#define ALLOC_NEW_GROUP 0x1 51#define ALLOC_NEW_GROUP 0x1
52#define ALLOC_GROUPS_FROM_GLOBAL 0x2 52#define ALLOC_GROUPS_FROM_GLOBAL 0x2
53 53
54#define OCFS2_MAX_INODES_TO_STEAL 1024 54#define OCFS2_MAX_TO_STEAL 1024
55 55
56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
@@ -95,13 +95,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
95 struct buffer_head *group_bh, 95 struct buffer_head *group_bh,
96 unsigned int bit_off, 96 unsigned int bit_off,
97 unsigned int num_bits); 97 unsigned int num_bits);
98static inline int ocfs2_block_group_clear_bits(handle_t *handle,
99 struct inode *alloc_inode,
100 struct ocfs2_group_desc *bg,
101 struct buffer_head *group_bh,
102 unsigned int bit_off,
103 unsigned int num_bits);
104
105static int ocfs2_relink_block_group(handle_t *handle, 98static int ocfs2_relink_block_group(handle_t *handle,
106 struct inode *alloc_inode, 99 struct inode *alloc_inode,
107 struct buffer_head *fe_bh, 100 struct buffer_head *fe_bh,
@@ -152,7 +145,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
152 145
153#define do_error(fmt, ...) \ 146#define do_error(fmt, ...) \
154 do{ \ 147 do{ \
155 if (clean_error) \ 148 if (resize) \
156 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ 149 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
157 else \ 150 else \
158 ocfs2_error(sb, fmt, ##__VA_ARGS__); \ 151 ocfs2_error(sb, fmt, ##__VA_ARGS__); \
@@ -160,7 +153,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
160 153
161static int ocfs2_validate_gd_self(struct super_block *sb, 154static int ocfs2_validate_gd_self(struct super_block *sb,
162 struct buffer_head *bh, 155 struct buffer_head *bh,
163 int clean_error) 156 int resize)
164{ 157{
165 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 158 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
166 159
@@ -211,7 +204,7 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
211static int ocfs2_validate_gd_parent(struct super_block *sb, 204static int ocfs2_validate_gd_parent(struct super_block *sb,
212 struct ocfs2_dinode *di, 205 struct ocfs2_dinode *di,
213 struct buffer_head *bh, 206 struct buffer_head *bh,
214 int clean_error) 207 int resize)
215{ 208{
216 unsigned int max_bits; 209 unsigned int max_bits;
217 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 210 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
@@ -233,8 +226,11 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
233 return -EINVAL; 226 return -EINVAL;
234 } 227 }
235 228
236 if (le16_to_cpu(gd->bg_chain) >= 229 /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
237 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) { 230 if ((le16_to_cpu(gd->bg_chain) >
231 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
232 ((le16_to_cpu(gd->bg_chain) ==
233 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
238 do_error("Group descriptor #%llu has bad chain %u", 234 do_error("Group descriptor #%llu has bad chain %u",
239 (unsigned long long)bh->b_blocknr, 235 (unsigned long long)bh->b_blocknr,
240 le16_to_cpu(gd->bg_chain)); 236 le16_to_cpu(gd->bg_chain));
@@ -637,12 +633,113 @@ bail:
637 return status; 633 return status;
638} 634}
639 635
636static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
637{
638 spin_lock(&osb->osb_lock);
639 osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
640 spin_unlock(&osb->osb_lock);
641 atomic_set(&osb->s_num_inodes_stolen, 0);
642}
643
644static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
645{
646 spin_lock(&osb->osb_lock);
647 osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
648 spin_unlock(&osb->osb_lock);
649 atomic_set(&osb->s_num_meta_stolen, 0);
650}
651
652void ocfs2_init_steal_slots(struct ocfs2_super *osb)
653{
654 ocfs2_init_inode_steal_slot(osb);
655 ocfs2_init_meta_steal_slot(osb);
656}
657
658static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
659{
660 spin_lock(&osb->osb_lock);
661 if (type == INODE_ALLOC_SYSTEM_INODE)
662 osb->s_inode_steal_slot = slot;
663 else if (type == EXTENT_ALLOC_SYSTEM_INODE)
664 osb->s_meta_steal_slot = slot;
665 spin_unlock(&osb->osb_lock);
666}
667
668static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
669{
670 int slot = OCFS2_INVALID_SLOT;
671
672 spin_lock(&osb->osb_lock);
673 if (type == INODE_ALLOC_SYSTEM_INODE)
674 slot = osb->s_inode_steal_slot;
675 else if (type == EXTENT_ALLOC_SYSTEM_INODE)
676 slot = osb->s_meta_steal_slot;
677 spin_unlock(&osb->osb_lock);
678
679 return slot;
680}
681
682static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
683{
684 return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
685}
686
687static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
688{
689 return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
690}
691
692static int ocfs2_steal_resource(struct ocfs2_super *osb,
693 struct ocfs2_alloc_context *ac,
694 int type)
695{
696 int i, status = -ENOSPC;
697 int slot = __ocfs2_get_steal_slot(osb, type);
698
699 /* Start to steal resource from the first slot after ours. */
700 if (slot == OCFS2_INVALID_SLOT)
701 slot = osb->slot_num + 1;
702
703 for (i = 0; i < osb->max_slots; i++, slot++) {
704 if (slot == osb->max_slots)
705 slot = 0;
706
707 if (slot == osb->slot_num)
708 continue;
709
710 status = ocfs2_reserve_suballoc_bits(osb, ac,
711 type,
712 (u32)slot, NULL,
713 NOT_ALLOC_NEW_GROUP);
714 if (status >= 0) {
715 __ocfs2_set_steal_slot(osb, slot, type);
716 break;
717 }
718
719 ocfs2_free_ac_resource(ac);
720 }
721
722 return status;
723}
724
725static int ocfs2_steal_inode(struct ocfs2_super *osb,
726 struct ocfs2_alloc_context *ac)
727{
728 return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
729}
730
731static int ocfs2_steal_meta(struct ocfs2_super *osb,
732 struct ocfs2_alloc_context *ac)
733{
734 return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
735}
736
640int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, 737int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
641 int blocks, 738 int blocks,
642 struct ocfs2_alloc_context **ac) 739 struct ocfs2_alloc_context **ac)
643{ 740{
644 int status; 741 int status;
645 u32 slot; 742 int slot = ocfs2_get_meta_steal_slot(osb);
646 743
647 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 744 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
648 if (!(*ac)) { 745 if (!(*ac)) {
@@ -653,12 +750,34 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
653 750
654 (*ac)->ac_bits_wanted = blocks; 751 (*ac)->ac_bits_wanted = blocks;
655 (*ac)->ac_which = OCFS2_AC_USE_META; 752 (*ac)->ac_which = OCFS2_AC_USE_META;
656 slot = osb->slot_num;
657 (*ac)->ac_group_search = ocfs2_block_group_search; 753 (*ac)->ac_group_search = ocfs2_block_group_search;
658 754
755 if (slot != OCFS2_INVALID_SLOT &&
756 atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
757 goto extent_steal;
758
759 atomic_set(&osb->s_num_meta_stolen, 0);
659 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 760 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
660 EXTENT_ALLOC_SYSTEM_INODE, 761 EXTENT_ALLOC_SYSTEM_INODE,
661 slot, NULL, ALLOC_NEW_GROUP); 762 (u32)osb->slot_num, NULL,
763 ALLOC_NEW_GROUP);
764
765
766 if (status >= 0) {
767 status = 0;
768 if (slot != OCFS2_INVALID_SLOT)
769 ocfs2_init_meta_steal_slot(osb);
770 goto bail;
771 } else if (status < 0 && status != -ENOSPC) {
772 mlog_errno(status);
773 goto bail;
774 }
775
776 ocfs2_free_ac_resource(*ac);
777
778extent_steal:
779 status = ocfs2_steal_meta(osb, *ac);
780 atomic_inc(&osb->s_num_meta_stolen);
662 if (status < 0) { 781 if (status < 0) {
663 if (status != -ENOSPC) 782 if (status != -ENOSPC)
664 mlog_errno(status); 783 mlog_errno(status);
@@ -685,43 +804,11 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
685 ac); 804 ac);
686} 805}
687 806
688static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
689 struct ocfs2_alloc_context *ac)
690{
691 int i, status = -ENOSPC;
692 s16 slot = ocfs2_get_inode_steal_slot(osb);
693
694 /* Start to steal inodes from the first slot after ours. */
695 if (slot == OCFS2_INVALID_SLOT)
696 slot = osb->slot_num + 1;
697
698 for (i = 0; i < osb->max_slots; i++, slot++) {
699 if (slot == osb->max_slots)
700 slot = 0;
701
702 if (slot == osb->slot_num)
703 continue;
704
705 status = ocfs2_reserve_suballoc_bits(osb, ac,
706 INODE_ALLOC_SYSTEM_INODE,
707 slot, NULL,
708 NOT_ALLOC_NEW_GROUP);
709 if (status >= 0) {
710 ocfs2_set_inode_steal_slot(osb, slot);
711 break;
712 }
713
714 ocfs2_free_ac_resource(ac);
715 }
716
717 return status;
718}
719
720int ocfs2_reserve_new_inode(struct ocfs2_super *osb, 807int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
721 struct ocfs2_alloc_context **ac) 808 struct ocfs2_alloc_context **ac)
722{ 809{
723 int status; 810 int status;
724 s16 slot = ocfs2_get_inode_steal_slot(osb); 811 int slot = ocfs2_get_inode_steal_slot(osb);
725 u64 alloc_group; 812 u64 alloc_group;
726 813
727 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 814 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
@@ -754,14 +841,14 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
754 * need to check our slots to see whether there is some space for us. 841 * need to check our slots to see whether there is some space for us.
755 */ 842 */
756 if (slot != OCFS2_INVALID_SLOT && 843 if (slot != OCFS2_INVALID_SLOT &&
757 atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL) 844 atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
758 goto inode_steal; 845 goto inode_steal;
759 846
760 atomic_set(&osb->s_num_inodes_stolen, 0); 847 atomic_set(&osb->s_num_inodes_stolen, 0);
761 alloc_group = osb->osb_inode_alloc_group; 848 alloc_group = osb->osb_inode_alloc_group;
762 status = ocfs2_reserve_suballoc_bits(osb, *ac, 849 status = ocfs2_reserve_suballoc_bits(osb, *ac,
763 INODE_ALLOC_SYSTEM_INODE, 850 INODE_ALLOC_SYSTEM_INODE,
764 osb->slot_num, 851 (u32)osb->slot_num,
765 &alloc_group, 852 &alloc_group,
766 ALLOC_NEW_GROUP | 853 ALLOC_NEW_GROUP |
767 ALLOC_GROUPS_FROM_GLOBAL); 854 ALLOC_GROUPS_FROM_GLOBAL);
@@ -789,7 +876,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
789 ocfs2_free_ac_resource(*ac); 876 ocfs2_free_ac_resource(*ac);
790 877
791inode_steal: 878inode_steal:
792 status = ocfs2_steal_inode_from_other_nodes(osb, *ac); 879 status = ocfs2_steal_inode(osb, *ac);
793 atomic_inc(&osb->s_num_inodes_stolen); 880 atomic_inc(&osb->s_num_inodes_stolen);
794 if (status < 0) { 881 if (status < 0) {
795 if (status != -ENOSPC) 882 if (status != -ENOSPC)
@@ -1884,18 +1971,18 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
1884 bits_wanted, cluster_start, num_clusters); 1971 bits_wanted, cluster_start, num_clusters);
1885} 1972}
1886 1973
1887static inline int ocfs2_block_group_clear_bits(handle_t *handle, 1974static int ocfs2_block_group_clear_bits(handle_t *handle,
1888 struct inode *alloc_inode, 1975 struct inode *alloc_inode,
1889 struct ocfs2_group_desc *bg, 1976 struct ocfs2_group_desc *bg,
1890 struct buffer_head *group_bh, 1977 struct buffer_head *group_bh,
1891 unsigned int bit_off, 1978 unsigned int bit_off,
1892 unsigned int num_bits) 1979 unsigned int num_bits,
1980 void (*undo_fn)(unsigned int bit,
1981 unsigned long *bmap))
1893{ 1982{
1894 int status; 1983 int status;
1895 unsigned int tmp; 1984 unsigned int tmp;
1896 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1897 struct ocfs2_group_desc *undo_bg = NULL; 1985 struct ocfs2_group_desc *undo_bg = NULL;
1898 int cluster_bitmap = 0;
1899 1986
1900 mlog_entry_void(); 1987 mlog_entry_void();
1901 1988
@@ -1905,20 +1992,18 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1905 1992
1906 mlog(0, "off = %u, num = %u\n", bit_off, num_bits); 1993 mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1907 1994
1908 if (ocfs2_is_cluster_bitmap(alloc_inode)) 1995 BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
1909 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1910
1911 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1996 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1912 group_bh, journal_type); 1997 group_bh,
1998 undo_fn ?
1999 OCFS2_JOURNAL_ACCESS_UNDO :
2000 OCFS2_JOURNAL_ACCESS_WRITE);
1913 if (status < 0) { 2001 if (status < 0) {
1914 mlog_errno(status); 2002 mlog_errno(status);
1915 goto bail; 2003 goto bail;
1916 } 2004 }
1917 2005
1918 if (ocfs2_is_cluster_bitmap(alloc_inode)) 2006 if (undo_fn) {
1919 cluster_bitmap = 1;
1920
1921 if (cluster_bitmap) {
1922 jbd_lock_bh_state(group_bh); 2007 jbd_lock_bh_state(group_bh);
1923 undo_bg = (struct ocfs2_group_desc *) 2008 undo_bg = (struct ocfs2_group_desc *)
1924 bh2jh(group_bh)->b_committed_data; 2009 bh2jh(group_bh)->b_committed_data;
@@ -1929,13 +2014,13 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1929 while(tmp--) { 2014 while(tmp--) {
1930 ocfs2_clear_bit((bit_off + tmp), 2015 ocfs2_clear_bit((bit_off + tmp),
1931 (unsigned long *) bg->bg_bitmap); 2016 (unsigned long *) bg->bg_bitmap);
1932 if (cluster_bitmap) 2017 if (undo_fn)
1933 ocfs2_set_bit(bit_off + tmp, 2018 undo_fn(bit_off + tmp,
1934 (unsigned long *) undo_bg->bg_bitmap); 2019 (unsigned long *) undo_bg->bg_bitmap);
1935 } 2020 }
1936 le16_add_cpu(&bg->bg_free_bits_count, num_bits); 2021 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1937 2022
1938 if (cluster_bitmap) 2023 if (undo_fn)
1939 jbd_unlock_bh_state(group_bh); 2024 jbd_unlock_bh_state(group_bh);
1940 2025
1941 status = ocfs2_journal_dirty(handle, group_bh); 2026 status = ocfs2_journal_dirty(handle, group_bh);
@@ -1948,12 +2033,14 @@ bail:
1948/* 2033/*
1949 * expects the suballoc inode to already be locked. 2034 * expects the suballoc inode to already be locked.
1950 */ 2035 */
1951int ocfs2_free_suballoc_bits(handle_t *handle, 2036static int _ocfs2_free_suballoc_bits(handle_t *handle,
1952 struct inode *alloc_inode, 2037 struct inode *alloc_inode,
1953 struct buffer_head *alloc_bh, 2038 struct buffer_head *alloc_bh,
1954 unsigned int start_bit, 2039 unsigned int start_bit,
1955 u64 bg_blkno, 2040 u64 bg_blkno,
1956 unsigned int count) 2041 unsigned int count,
2042 void (*undo_fn)(unsigned int bit,
2043 unsigned long *bitmap))
1957{ 2044{
1958 int status = 0; 2045 int status = 0;
1959 u32 tmp_used; 2046 u32 tmp_used;
@@ -1988,7 +2075,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
1988 2075
1989 status = ocfs2_block_group_clear_bits(handle, alloc_inode, 2076 status = ocfs2_block_group_clear_bits(handle, alloc_inode,
1990 group, group_bh, 2077 group, group_bh,
1991 start_bit, count); 2078 start_bit, count, undo_fn);
1992 if (status < 0) { 2079 if (status < 0) {
1993 mlog_errno(status); 2080 mlog_errno(status);
1994 goto bail; 2081 goto bail;
@@ -2019,6 +2106,17 @@ bail:
2019 return status; 2106 return status;
2020} 2107}
2021 2108
2109int ocfs2_free_suballoc_bits(handle_t *handle,
2110 struct inode *alloc_inode,
2111 struct buffer_head *alloc_bh,
2112 unsigned int start_bit,
2113 u64 bg_blkno,
2114 unsigned int count)
2115{
2116 return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2117 start_bit, bg_blkno, count, NULL);
2118}
2119
2022int ocfs2_free_dinode(handle_t *handle, 2120int ocfs2_free_dinode(handle_t *handle,
2023 struct inode *inode_alloc_inode, 2121 struct inode *inode_alloc_inode,
2024 struct buffer_head *inode_alloc_bh, 2122 struct buffer_head *inode_alloc_bh,
@@ -2032,11 +2130,13 @@ int ocfs2_free_dinode(handle_t *handle,
2032 inode_alloc_bh, bit, bg_blkno, 1); 2130 inode_alloc_bh, bit, bg_blkno, 1);
2033} 2131}
2034 2132
2035int ocfs2_free_clusters(handle_t *handle, 2133static int _ocfs2_free_clusters(handle_t *handle,
2036 struct inode *bitmap_inode, 2134 struct inode *bitmap_inode,
2037 struct buffer_head *bitmap_bh, 2135 struct buffer_head *bitmap_bh,
2038 u64 start_blk, 2136 u64 start_blk,
2039 unsigned int num_clusters) 2137 unsigned int num_clusters,
2138 void (*undo_fn)(unsigned int bit,
2139 unsigned long *bitmap))
2040{ 2140{
2041 int status; 2141 int status;
2042 u16 bg_start_bit; 2142 u16 bg_start_bit;
@@ -2063,9 +2163,9 @@ int ocfs2_free_clusters(handle_t *handle,
2063 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n", 2163 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
2064 (unsigned long long)bg_blkno, bg_start_bit); 2164 (unsigned long long)bg_blkno, bg_start_bit);
2065 2165
2066 status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, 2166 status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2067 bg_start_bit, bg_blkno, 2167 bg_start_bit, bg_blkno,
2068 num_clusters); 2168 num_clusters, undo_fn);
2069 if (status < 0) { 2169 if (status < 0) {
2070 mlog_errno(status); 2170 mlog_errno(status);
2071 goto out; 2171 goto out;
@@ -2079,6 +2179,32 @@ out:
2079 return status; 2179 return status;
2080} 2180}
2081 2181
2182int ocfs2_free_clusters(handle_t *handle,
2183 struct inode *bitmap_inode,
2184 struct buffer_head *bitmap_bh,
2185 u64 start_blk,
2186 unsigned int num_clusters)
2187{
2188 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2189 start_blk, num_clusters,
2190 _ocfs2_set_bit);
2191}
2192
2193/*
2194 * Give never-used clusters back to the global bitmap. We don't need
2195 * to protect these bits in the undo buffer.
2196 */
2197int ocfs2_release_clusters(handle_t *handle,
2198 struct inode *bitmap_inode,
2199 struct buffer_head *bitmap_bh,
2200 u64 start_blk,
2201 unsigned int num_clusters)
2202{
2203 return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2204 start_blk, num_clusters,
2205 _ocfs2_clear_bit);
2206}
2207
2082static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) 2208static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2083{ 2209{
2084 printk("Block Group:\n"); 2210 printk("Block Group:\n");
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 8c9a78a43164..e0f46df357e6 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -56,6 +56,7 @@ struct ocfs2_alloc_context {
56 is the same as ~0 - unlimited */ 56 is the same as ~0 - unlimited */
57}; 57};
58 58
59void ocfs2_init_steal_slots(struct ocfs2_super *osb);
59void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); 60void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
60static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac) 61static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
61{ 62{
@@ -126,6 +127,11 @@ int ocfs2_free_clusters(handle_t *handle,
126 struct buffer_head *bitmap_bh, 127 struct buffer_head *bitmap_bh,
127 u64 start_blk, 128 u64 start_blk,
128 unsigned int num_clusters); 129 unsigned int num_clusters);
130int ocfs2_release_clusters(handle_t *handle,
131 struct inode *bitmap_inode,
132 struct buffer_head *bitmap_bh,
133 u64 start_blk,
134 unsigned int num_clusters);
129 135
130static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit) 136static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
131{ 137{
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 26069917a9f5..dee03197a494 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -69,6 +69,7 @@
69#include "xattr.h" 69#include "xattr.h"
70#include "quota.h" 70#include "quota.h"
71#include "refcounttree.h" 71#include "refcounttree.h"
72#include "suballoc.h"
72 73
73#include "buffer_head_io.h" 74#include "buffer_head_io.h"
74 75
@@ -301,9 +302,12 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
301 302
302 spin_lock(&osb->osb_lock); 303 spin_lock(&osb->osb_lock);
303 out += snprintf(buf + out, len - out, 304 out += snprintf(buf + out, len - out,
304 "%10s => Slot: %d NumStolen: %d\n", "Steal", 305 "%10s => InodeSlot: %d StolenInodes: %d, "
306 "MetaSlot: %d StolenMeta: %d\n", "Steal",
305 osb->s_inode_steal_slot, 307 osb->s_inode_steal_slot,
306 atomic_read(&osb->s_num_inodes_stolen)); 308 atomic_read(&osb->s_num_inodes_stolen),
309 osb->s_meta_steal_slot,
310 atomic_read(&osb->s_num_meta_stolen));
307 spin_unlock(&osb->osb_lock); 311 spin_unlock(&osb->osb_lock);
308 312
309 out += snprintf(buf + out, len - out, "OrphanScan => "); 313 out += snprintf(buf + out, len - out, "OrphanScan => ");
@@ -1062,7 +1066,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1062 "file system, but write access is " 1066 "file system, but write access is "
1063 "unavailable.\n"); 1067 "unavailable.\n");
1064 else 1068 else
1065 mlog_errno(status); 1069 mlog_errno(status);
1066 goto read_super_error; 1070 goto read_super_error;
1067 } 1071 }
1068 1072
@@ -1997,7 +2001,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1997 osb->blocked_lock_count = 0; 2001 osb->blocked_lock_count = 0;
1998 spin_lock_init(&osb->osb_lock); 2002 spin_lock_init(&osb->osb_lock);
1999 spin_lock_init(&osb->osb_xattr_lock); 2003 spin_lock_init(&osb->osb_xattr_lock);
2000 ocfs2_init_inode_steal_slot(osb); 2004 ocfs2_init_steal_slots(osb);
2001 2005
2002 atomic_set(&osb->alloc_stats.moves, 0); 2006 atomic_set(&osb->alloc_stats.moves, 0);
2003 atomic_set(&osb->alloc_stats.local_data, 0); 2007 atomic_set(&osb->alloc_stats.local_data, 0);
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 49b133ccbf11..32499d213fc4 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -137,20 +137,20 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry,
137 } 137 }
138 138
139 memcpy(link, target, len); 139 memcpy(link, target, len);
140 nd_set_link(nd, link);
141 140
142bail: 141bail:
142 nd_set_link(nd, status ? ERR_PTR(status) : link);
143 brelse(bh); 143 brelse(bh);
144 144
145 mlog_exit(status); 145 mlog_exit(status);
146 return status ? ERR_PTR(status) : link; 146 return NULL;
147} 147}
148 148
149static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) 149static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
150{ 150{
151 char *link = cookie; 151 char *link = nd_get_link(nd);
152 152 if (!IS_ERR(link))
153 kfree(link); 153 kfree(link);
154} 154}
155 155
156const struct inode_operations ocfs2_symlink_inode_operations = { 156const struct inode_operations ocfs2_symlink_inode_operations = {
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 40e53702948c..bfe7190cdbf1 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -25,7 +25,6 @@
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h> 28#include <linux/highmem.h>
30 29
31#define MLOG_MASK_PREFIX ML_INODE 30#define MLOG_MASK_PREFIX ML_INODE
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index c61369342a27..a0a120e82b97 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -267,8 +267,8 @@ static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
267} 267}
268 268
269/* Warning: even if it returns true, this does *not* guarantee that 269/* Warning: even if it returns true, this does *not* guarantee that
270 * the block is stored in our inode metadata cache. 270 * the block is stored in our inode metadata cache.
271 * 271 *
272 * This can be called under lock_buffer() 272 * This can be called under lock_buffer()
273 */ 273 */
274int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci, 274int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 8fc6fb071c6d..3e7773089b96 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -116,10 +116,11 @@ static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
116}; 116};
117 117
118struct ocfs2_xattr_info { 118struct ocfs2_xattr_info {
119 int name_index; 119 int xi_name_index;
120 const char *name; 120 const char *xi_name;
121 const void *value; 121 int xi_name_len;
122 size_t value_len; 122 const void *xi_value;
123 size_t xi_value_len;
123}; 124};
124 125
125struct ocfs2_xattr_search { 126struct ocfs2_xattr_search {
@@ -137,6 +138,115 @@ struct ocfs2_xattr_search {
137 int not_found; 138 int not_found;
138}; 139};
139 140
141/* Operations on struct ocfs2_xa_entry */
142struct ocfs2_xa_loc;
143struct ocfs2_xa_loc_operations {
144 /*
145 * Journal functions
146 */
147 int (*xlo_journal_access)(handle_t *handle, struct ocfs2_xa_loc *loc,
148 int type);
149 void (*xlo_journal_dirty)(handle_t *handle, struct ocfs2_xa_loc *loc);
150
151 /*
152 * Return a pointer to the appropriate buffer in loc->xl_storage
153 * at the given offset from loc->xl_header.
154 */
155 void *(*xlo_offset_pointer)(struct ocfs2_xa_loc *loc, int offset);
156
157 /* Can we reuse the existing entry for the new value? */
158 int (*xlo_can_reuse)(struct ocfs2_xa_loc *loc,
159 struct ocfs2_xattr_info *xi);
160
161 /* How much space is needed for the new value? */
162 int (*xlo_check_space)(struct ocfs2_xa_loc *loc,
163 struct ocfs2_xattr_info *xi);
164
165 /*
166 * Return the offset of the first name+value pair. This is
167 * the start of our downward-filling free space.
168 */
169 int (*xlo_get_free_start)(struct ocfs2_xa_loc *loc);
170
171 /*
172 * Remove the name+value at this location. Do whatever is
173 * appropriate with the remaining name+value pairs.
174 */
175 void (*xlo_wipe_namevalue)(struct ocfs2_xa_loc *loc);
176
177 /* Fill xl_entry with a new entry */
178 void (*xlo_add_entry)(struct ocfs2_xa_loc *loc, u32 name_hash);
179
180 /* Add name+value storage to an entry */
181 void (*xlo_add_namevalue)(struct ocfs2_xa_loc *loc, int size);
182
183 /*
184 * Initialize the value buf's access and bh fields for this entry.
185 * ocfs2_xa_fill_value_buf() will handle the xv pointer.
186 */
187 void (*xlo_fill_value_buf)(struct ocfs2_xa_loc *loc,
188 struct ocfs2_xattr_value_buf *vb);
189};
190
191/*
192 * Describes an xattr entry location. This is a memory structure
193 * tracking the on-disk structure.
194 */
195struct ocfs2_xa_loc {
196 /* This xattr belongs to this inode */
197 struct inode *xl_inode;
198
199 /* The ocfs2_xattr_header inside the on-disk storage. Not NULL. */
200 struct ocfs2_xattr_header *xl_header;
201
202 /* Bytes from xl_header to the end of the storage */
203 int xl_size;
204
205 /*
206 * The ocfs2_xattr_entry this location describes. If this is
207 * NULL, this location describes the on-disk structure where it
208 * would have been.
209 */
210 struct ocfs2_xattr_entry *xl_entry;
211
212 /*
213 * Internal housekeeping
214 */
215
216 /* Buffer(s) containing this entry */
217 void *xl_storage;
218
219 /* Operations on the storage backing this location */
220 const struct ocfs2_xa_loc_operations *xl_ops;
221};
222
223/*
224 * Convenience functions to calculate how much space is needed for a
225 * given name+value pair
226 */
227static int namevalue_size(int name_len, uint64_t value_len)
228{
229 if (value_len > OCFS2_XATTR_INLINE_SIZE)
230 return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
231 else
232 return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
233}
234
235static int namevalue_size_xi(struct ocfs2_xattr_info *xi)
236{
237 return namevalue_size(xi->xi_name_len, xi->xi_value_len);
238}
239
240static int namevalue_size_xe(struct ocfs2_xattr_entry *xe)
241{
242 u64 value_len = le64_to_cpu(xe->xe_value_size);
243
244 BUG_ON((value_len > OCFS2_XATTR_INLINE_SIZE) &&
245 ocfs2_xattr_is_local(xe));
246 return namevalue_size(xe->xe_name_len, value_len);
247}
248
249
140static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb, 250static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
141 struct ocfs2_xattr_header *xh, 251 struct ocfs2_xattr_header *xh,
142 int index, 252 int index,
@@ -212,14 +322,6 @@ static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
212 return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits); 322 return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
213} 323}
214 324
215static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
216{
217 u16 len = sb->s_blocksize -
218 offsetof(struct ocfs2_xattr_header, xh_entries);
219
220 return len / sizeof(struct ocfs2_xattr_entry);
221}
222
223#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr) 325#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
224#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data) 326#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
225#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0)) 327#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
@@ -463,35 +565,22 @@ static u32 ocfs2_xattr_name_hash(struct inode *inode,
463 return hash; 565 return hash;
464} 566}
465 567
466/* 568static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
467 * ocfs2_xattr_hash_entry()
468 *
469 * Compute the hash of an extended attribute.
470 */
471static void ocfs2_xattr_hash_entry(struct inode *inode,
472 struct ocfs2_xattr_header *header,
473 struct ocfs2_xattr_entry *entry)
474{ 569{
475 u32 hash = 0; 570 return namevalue_size(name_len, value_len) +
476 char *name = (char *)header + le16_to_cpu(entry->xe_name_offset); 571 sizeof(struct ocfs2_xattr_entry);
477
478 hash = ocfs2_xattr_name_hash(inode, name, entry->xe_name_len);
479 entry->xe_name_hash = cpu_to_le32(hash);
480
481 return;
482} 572}
483 573
484static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len) 574static int ocfs2_xi_entry_usage(struct ocfs2_xattr_info *xi)
485{ 575{
486 int size = 0; 576 return namevalue_size_xi(xi) +
487 577 sizeof(struct ocfs2_xattr_entry);
488 if (value_len <= OCFS2_XATTR_INLINE_SIZE) 578}
489 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
490 else
491 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
492 size += sizeof(struct ocfs2_xattr_entry);
493 579
494 return size; 580static int ocfs2_xe_entry_usage(struct ocfs2_xattr_entry *xe)
581{
582 return namevalue_size_xe(xe) +
583 sizeof(struct ocfs2_xattr_entry);
495} 584}
496 585
497int ocfs2_calc_security_init(struct inode *dir, 586int ocfs2_calc_security_init(struct inode *dir,
@@ -1308,452 +1397,897 @@ out:
1308 return ret; 1397 return ret;
1309} 1398}
1310 1399
1311static int ocfs2_xattr_cleanup(struct inode *inode, 1400static int ocfs2_xa_check_space_helper(int needed_space, int free_start,
1312 handle_t *handle, 1401 int num_entries)
1313 struct ocfs2_xattr_info *xi,
1314 struct ocfs2_xattr_search *xs,
1315 struct ocfs2_xattr_value_buf *vb,
1316 size_t offs)
1317{ 1402{
1318 int ret = 0; 1403 int free_space;
1319 size_t name_len = strlen(xi->name);
1320 void *val = xs->base + offs;
1321 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
1322 1404
1323 ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, 1405 if (!needed_space)
1324 OCFS2_JOURNAL_ACCESS_WRITE); 1406 return 0;
1325 if (ret) {
1326 mlog_errno(ret);
1327 goto out;
1328 }
1329 /* Decrease xattr count */
1330 le16_add_cpu(&xs->header->xh_count, -1);
1331 /* Remove the xattr entry and tree root which has already be set*/
1332 memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
1333 memset(val, 0, size);
1334 1407
1335 ret = ocfs2_journal_dirty(handle, vb->vb_bh); 1408 free_space = free_start -
1336 if (ret < 0) 1409 sizeof(struct ocfs2_xattr_header) -
1337 mlog_errno(ret); 1410 (num_entries * sizeof(struct ocfs2_xattr_entry)) -
1338out: 1411 OCFS2_XATTR_HEADER_GAP;
1339 return ret; 1412 if (free_space < 0)
1413 return -EIO;
1414 if (free_space < needed_space)
1415 return -ENOSPC;
1416
1417 return 0;
1340} 1418}
1341 1419
1342static int ocfs2_xattr_update_entry(struct inode *inode, 1420static int ocfs2_xa_journal_access(handle_t *handle, struct ocfs2_xa_loc *loc,
1343 handle_t *handle, 1421 int type)
1344 struct ocfs2_xattr_info *xi,
1345 struct ocfs2_xattr_search *xs,
1346 struct ocfs2_xattr_value_buf *vb,
1347 size_t offs)
1348{ 1422{
1349 int ret; 1423 return loc->xl_ops->xlo_journal_access(handle, loc, type);
1424}
1350 1425
1351 ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, 1426static void ocfs2_xa_journal_dirty(handle_t *handle, struct ocfs2_xa_loc *loc)
1352 OCFS2_JOURNAL_ACCESS_WRITE); 1427{
1353 if (ret) { 1428 loc->xl_ops->xlo_journal_dirty(handle, loc);
1354 mlog_errno(ret); 1429}
1355 goto out;
1356 }
1357 1430
1358 xs->here->xe_name_offset = cpu_to_le16(offs); 1431/* Give a pointer into the storage for the given offset */
1359 xs->here->xe_value_size = cpu_to_le64(xi->value_len); 1432static void *ocfs2_xa_offset_pointer(struct ocfs2_xa_loc *loc, int offset)
1360 if (xi->value_len <= OCFS2_XATTR_INLINE_SIZE) 1433{
1361 ocfs2_xattr_set_local(xs->here, 1); 1434 BUG_ON(offset >= loc->xl_size);
1362 else 1435 return loc->xl_ops->xlo_offset_pointer(loc, offset);
1363 ocfs2_xattr_set_local(xs->here, 0); 1436}
1364 ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
1365 1437
1366 ret = ocfs2_journal_dirty(handle, vb->vb_bh); 1438/*
1367 if (ret < 0) 1439 * Wipe the name+value pair and allow the storage to reclaim it. This
1368 mlog_errno(ret); 1440 * must be followed by either removal of the entry or a call to
1369out: 1441 * ocfs2_xa_add_namevalue().
1370 return ret; 1442 */
1443static void ocfs2_xa_wipe_namevalue(struct ocfs2_xa_loc *loc)
1444{
1445 loc->xl_ops->xlo_wipe_namevalue(loc);
1371} 1446}
1372 1447
1373/* 1448/*
1374 * ocfs2_xattr_set_value_outside() 1449 * Find lowest offset to a name+value pair. This is the start of our
1375 * 1450 * downward-growing free space.
1376 * Set large size value in B tree.
1377 */ 1451 */
1378static int ocfs2_xattr_set_value_outside(struct inode *inode, 1452static int ocfs2_xa_get_free_start(struct ocfs2_xa_loc *loc)
1379 struct ocfs2_xattr_info *xi,
1380 struct ocfs2_xattr_search *xs,
1381 struct ocfs2_xattr_set_ctxt *ctxt,
1382 struct ocfs2_xattr_value_buf *vb,
1383 size_t offs)
1384{ 1453{
1385 size_t name_len = strlen(xi->name); 1454 return loc->xl_ops->xlo_get_free_start(loc);
1386 void *val = xs->base + offs; 1455}
1387 struct ocfs2_xattr_value_root *xv = NULL;
1388 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
1389 int ret = 0;
1390 1456
1391 memset(val, 0, size); 1457/* Can we reuse loc->xl_entry for xi? */
1392 memcpy(val, xi->name, name_len); 1458static int ocfs2_xa_can_reuse_entry(struct ocfs2_xa_loc *loc,
1393 xv = (struct ocfs2_xattr_value_root *) 1459 struct ocfs2_xattr_info *xi)
1394 (val + OCFS2_XATTR_SIZE(name_len)); 1460{
1395 xv->xr_clusters = 0; 1461 return loc->xl_ops->xlo_can_reuse(loc, xi);
1396 xv->xr_last_eb_blk = 0; 1462}
1397 xv->xr_list.l_tree_depth = 0; 1463
1398 xv->xr_list.l_count = cpu_to_le16(1); 1464/* How much free space is needed to set the new value */
1399 xv->xr_list.l_next_free_rec = 0; 1465static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc,
1400 vb->vb_xv = xv; 1466 struct ocfs2_xattr_info *xi)
1401 1467{
1402 ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt); 1468 return loc->xl_ops->xlo_check_space(loc, xi);
1403 if (ret < 0) { 1469}
1404 mlog_errno(ret); 1470
1405 return ret; 1471static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
1472{
1473 loc->xl_ops->xlo_add_entry(loc, name_hash);
1474 loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash);
1475 /*
1476 * We can't leave the new entry's xe_name_offset at zero or
1477 * add_namevalue() will go nuts. We set it to the size of our
1478 * storage so that it can never be less than any other entry.
1479 */
1480 loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size);
1481}
1482
1483static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc,
1484 struct ocfs2_xattr_info *xi)
1485{
1486 int size = namevalue_size_xi(xi);
1487 int nameval_offset;
1488 char *nameval_buf;
1489
1490 loc->xl_ops->xlo_add_namevalue(loc, size);
1491 loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
1492 loc->xl_entry->xe_name_len = xi->xi_name_len;
1493 ocfs2_xattr_set_type(loc->xl_entry, xi->xi_name_index);
1494 ocfs2_xattr_set_local(loc->xl_entry,
1495 xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE);
1496
1497 nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
1498 nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
1499 memset(nameval_buf, 0, size);
1500 memcpy(nameval_buf, xi->xi_name, xi->xi_name_len);
1501}
1502
1503static void ocfs2_xa_fill_value_buf(struct ocfs2_xa_loc *loc,
1504 struct ocfs2_xattr_value_buf *vb)
1505{
1506 int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
1507 int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
1508
1509 /* Value bufs are for value trees */
1510 BUG_ON(ocfs2_xattr_is_local(loc->xl_entry));
1511 BUG_ON(namevalue_size_xe(loc->xl_entry) !=
1512 (name_size + OCFS2_XATTR_ROOT_SIZE));
1513
1514 loc->xl_ops->xlo_fill_value_buf(loc, vb);
1515 vb->vb_xv =
1516 (struct ocfs2_xattr_value_root *)ocfs2_xa_offset_pointer(loc,
1517 nameval_offset +
1518 name_size);
1519}
1520
1521static int ocfs2_xa_block_journal_access(handle_t *handle,
1522 struct ocfs2_xa_loc *loc, int type)
1523{
1524 struct buffer_head *bh = loc->xl_storage;
1525 ocfs2_journal_access_func access;
1526
1527 if (loc->xl_size == (bh->b_size -
1528 offsetof(struct ocfs2_xattr_block,
1529 xb_attrs.xb_header)))
1530 access = ocfs2_journal_access_xb;
1531 else
1532 access = ocfs2_journal_access_di;
1533 return access(handle, INODE_CACHE(loc->xl_inode), bh, type);
1534}
1535
1536static void ocfs2_xa_block_journal_dirty(handle_t *handle,
1537 struct ocfs2_xa_loc *loc)
1538{
1539 struct buffer_head *bh = loc->xl_storage;
1540
1541 ocfs2_journal_dirty(handle, bh);
1542}
1543
1544static void *ocfs2_xa_block_offset_pointer(struct ocfs2_xa_loc *loc,
1545 int offset)
1546{
1547 return (char *)loc->xl_header + offset;
1548}
1549
1550static int ocfs2_xa_block_can_reuse(struct ocfs2_xa_loc *loc,
1551 struct ocfs2_xattr_info *xi)
1552{
1553 /*
1554 * Block storage is strict. If the sizes aren't exact, we will
1555 * remove the old one and reinsert the new.
1556 */
1557 return namevalue_size_xe(loc->xl_entry) ==
1558 namevalue_size_xi(xi);
1559}
1560
1561static int ocfs2_xa_block_get_free_start(struct ocfs2_xa_loc *loc)
1562{
1563 struct ocfs2_xattr_header *xh = loc->xl_header;
1564 int i, count = le16_to_cpu(xh->xh_count);
1565 int offset, free_start = loc->xl_size;
1566
1567 for (i = 0; i < count; i++) {
1568 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
1569 if (offset < free_start)
1570 free_start = offset;
1406 } 1571 }
1407 ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs); 1572
1408 if (ret < 0) { 1573 return free_start;
1409 mlog_errno(ret); 1574}
1410 return ret; 1575
1576static int ocfs2_xa_block_check_space(struct ocfs2_xa_loc *loc,
1577 struct ocfs2_xattr_info *xi)
1578{
1579 int count = le16_to_cpu(loc->xl_header->xh_count);
1580 int free_start = ocfs2_xa_get_free_start(loc);
1581 int needed_space = ocfs2_xi_entry_usage(xi);
1582
1583 /*
1584 * Block storage will reclaim the original entry before inserting
1585 * the new value, so we only need the difference. If the new
1586 * entry is smaller than the old one, we don't need anything.
1587 */
1588 if (loc->xl_entry) {
1589 /* Don't need space if we're reusing! */
1590 if (ocfs2_xa_can_reuse_entry(loc, xi))
1591 needed_space = 0;
1592 else
1593 needed_space -= ocfs2_xe_entry_usage(loc->xl_entry);
1411 } 1594 }
1412 ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb, 1595 if (needed_space < 0)
1413 xi->value, xi->value_len); 1596 needed_space = 0;
1414 if (ret < 0) 1597 return ocfs2_xa_check_space_helper(needed_space, free_start, count);
1415 mlog_errno(ret); 1598}
1416 1599
1417 return ret; 1600/*
1601 * Block storage for xattrs keeps the name+value pairs compacted. When
1602 * we remove one, we have to shift any that preceded it towards the end.
1603 */
1604static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
1605{
1606 int i, offset;
1607 int namevalue_offset, first_namevalue_offset, namevalue_size;
1608 struct ocfs2_xattr_entry *entry = loc->xl_entry;
1609 struct ocfs2_xattr_header *xh = loc->xl_header;
1610 int count = le16_to_cpu(xh->xh_count);
1611
1612 namevalue_offset = le16_to_cpu(entry->xe_name_offset);
1613 namevalue_size = namevalue_size_xe(entry);
1614 first_namevalue_offset = ocfs2_xa_get_free_start(loc);
1615
1616 /* Shift the name+value pairs */
1617 memmove((char *)xh + first_namevalue_offset + namevalue_size,
1618 (char *)xh + first_namevalue_offset,
1619 namevalue_offset - first_namevalue_offset);
1620 memset((char *)xh + first_namevalue_offset, 0, namevalue_size);
1621
1622 /* Now tell xh->xh_entries about it */
1623 for (i = 0; i < count; i++) {
1624 offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
1625 if (offset <= namevalue_offset)
1626 le16_add_cpu(&xh->xh_entries[i].xe_name_offset,
1627 namevalue_size);
1628 }
1629
1630 /*
1631 * Note that we don't update xh_free_start or xh_name_value_len
1632 * because they're not used in block-stored xattrs.
1633 */
1634}
1635
1636static void ocfs2_xa_block_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
1637{
1638 int count = le16_to_cpu(loc->xl_header->xh_count);
1639 loc->xl_entry = &(loc->xl_header->xh_entries[count]);
1640 le16_add_cpu(&loc->xl_header->xh_count, 1);
1641 memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
1642}
1643
1644static void ocfs2_xa_block_add_namevalue(struct ocfs2_xa_loc *loc, int size)
1645{
1646 int free_start = ocfs2_xa_get_free_start(loc);
1647
1648 loc->xl_entry->xe_name_offset = cpu_to_le16(free_start - size);
1649}
1650
1651static void ocfs2_xa_block_fill_value_buf(struct ocfs2_xa_loc *loc,
1652 struct ocfs2_xattr_value_buf *vb)
1653{
1654 struct buffer_head *bh = loc->xl_storage;
1655
1656 if (loc->xl_size == (bh->b_size -
1657 offsetof(struct ocfs2_xattr_block,
1658 xb_attrs.xb_header)))
1659 vb->vb_access = ocfs2_journal_access_xb;
1660 else
1661 vb->vb_access = ocfs2_journal_access_di;
1662 vb->vb_bh = bh;
1418} 1663}
1419 1664
1420/* 1665/*
1421 * ocfs2_xattr_set_entry_local() 1666 * Operations for xattrs stored in blocks. This includes inline inode
1422 * 1667 * storage and unindexed ocfs2_xattr_blocks.
1423 * Set, replace or remove extended attribute in local.
1424 */ 1668 */
1425static void ocfs2_xattr_set_entry_local(struct inode *inode, 1669static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = {
1426 struct ocfs2_xattr_info *xi, 1670 .xlo_journal_access = ocfs2_xa_block_journal_access,
1427 struct ocfs2_xattr_search *xs, 1671 .xlo_journal_dirty = ocfs2_xa_block_journal_dirty,
1428 struct ocfs2_xattr_entry *last, 1672 .xlo_offset_pointer = ocfs2_xa_block_offset_pointer,
1429 size_t min_offs) 1673 .xlo_check_space = ocfs2_xa_block_check_space,
1674 .xlo_can_reuse = ocfs2_xa_block_can_reuse,
1675 .xlo_get_free_start = ocfs2_xa_block_get_free_start,
1676 .xlo_wipe_namevalue = ocfs2_xa_block_wipe_namevalue,
1677 .xlo_add_entry = ocfs2_xa_block_add_entry,
1678 .xlo_add_namevalue = ocfs2_xa_block_add_namevalue,
1679 .xlo_fill_value_buf = ocfs2_xa_block_fill_value_buf,
1680};
1681
1682static int ocfs2_xa_bucket_journal_access(handle_t *handle,
1683 struct ocfs2_xa_loc *loc, int type)
1430{ 1684{
1431 size_t name_len = strlen(xi->name); 1685 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1432 int i;
1433 1686
1434 if (xi->value && xs->not_found) { 1687 return ocfs2_xattr_bucket_journal_access(handle, bucket, type);
1435 /* Insert the new xattr entry. */ 1688}
1436 le16_add_cpu(&xs->header->xh_count, 1); 1689
1437 ocfs2_xattr_set_type(last, xi->name_index); 1690static void ocfs2_xa_bucket_journal_dirty(handle_t *handle,
1438 ocfs2_xattr_set_local(last, 1); 1691 struct ocfs2_xa_loc *loc)
1439 last->xe_name_len = name_len; 1692{
1440 } else { 1693 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1441 void *first_val; 1694
1442 void *val; 1695 ocfs2_xattr_bucket_journal_dirty(handle, bucket);
1443 size_t offs, size; 1696}
1444 1697
1445 first_val = xs->base + min_offs; 1698static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc,
1446 offs = le16_to_cpu(xs->here->xe_name_offset); 1699 int offset)
1447 val = xs->base + offs; 1700{
1448 1701 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1449 if (le64_to_cpu(xs->here->xe_value_size) > 1702 int block, block_offset;
1450 OCFS2_XATTR_INLINE_SIZE) 1703
1451 size = OCFS2_XATTR_SIZE(name_len) + 1704 /* The header is at the front of the bucket */
1452 OCFS2_XATTR_ROOT_SIZE; 1705 block = offset >> loc->xl_inode->i_sb->s_blocksize_bits;
1706 block_offset = offset % loc->xl_inode->i_sb->s_blocksize;
1707
1708 return bucket_block(bucket, block) + block_offset;
1709}
1710
1711static int ocfs2_xa_bucket_can_reuse(struct ocfs2_xa_loc *loc,
1712 struct ocfs2_xattr_info *xi)
1713{
1714 return namevalue_size_xe(loc->xl_entry) >=
1715 namevalue_size_xi(xi);
1716}
1717
1718static int ocfs2_xa_bucket_get_free_start(struct ocfs2_xa_loc *loc)
1719{
1720 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1721 return le16_to_cpu(bucket_xh(bucket)->xh_free_start);
1722}
1723
1724static int ocfs2_bucket_align_free_start(struct super_block *sb,
1725 int free_start, int size)
1726{
1727 /*
1728 * We need to make sure that the name+value pair fits within
1729 * one block.
1730 */
1731 if (((free_start - size) >> sb->s_blocksize_bits) !=
1732 ((free_start - 1) >> sb->s_blocksize_bits))
1733 free_start -= free_start % sb->s_blocksize;
1734
1735 return free_start;
1736}
1737
1738static int ocfs2_xa_bucket_check_space(struct ocfs2_xa_loc *loc,
1739 struct ocfs2_xattr_info *xi)
1740{
1741 int rc;
1742 int count = le16_to_cpu(loc->xl_header->xh_count);
1743 int free_start = ocfs2_xa_get_free_start(loc);
1744 int needed_space = ocfs2_xi_entry_usage(xi);
1745 int size = namevalue_size_xi(xi);
1746 struct super_block *sb = loc->xl_inode->i_sb;
1747
1748 /*
1749 * Bucket storage does not reclaim name+value pairs it cannot
1750 * reuse. They live as holes until the bucket fills, and then
1751 * the bucket is defragmented. However, the bucket can reclaim
1752 * the ocfs2_xattr_entry.
1753 */
1754 if (loc->xl_entry) {
1755 /* Don't need space if we're reusing! */
1756 if (ocfs2_xa_can_reuse_entry(loc, xi))
1757 needed_space = 0;
1453 else 1758 else
1454 size = OCFS2_XATTR_SIZE(name_len) + 1759 needed_space -= sizeof(struct ocfs2_xattr_entry);
1455 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); 1760 }
1456 1761 BUG_ON(needed_space < 0);
1457 if (xi->value && size == OCFS2_XATTR_SIZE(name_len) +
1458 OCFS2_XATTR_SIZE(xi->value_len)) {
1459 /* The old and the new value have the
1460 same size. Just replace the value. */
1461 ocfs2_xattr_set_local(xs->here, 1);
1462 xs->here->xe_value_size = cpu_to_le64(xi->value_len);
1463 /* Clear value bytes. */
1464 memset(val + OCFS2_XATTR_SIZE(name_len),
1465 0,
1466 OCFS2_XATTR_SIZE(xi->value_len));
1467 memcpy(val + OCFS2_XATTR_SIZE(name_len),
1468 xi->value,
1469 xi->value_len);
1470 return;
1471 }
1472 /* Remove the old name+value. */
1473 memmove(first_val + size, first_val, val - first_val);
1474 memset(first_val, 0, size);
1475 xs->here->xe_name_hash = 0;
1476 xs->here->xe_name_offset = 0;
1477 ocfs2_xattr_set_local(xs->here, 1);
1478 xs->here->xe_value_size = 0;
1479
1480 min_offs += size;
1481
1482 /* Adjust all value offsets. */
1483 last = xs->header->xh_entries;
1484 for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
1485 size_t o = le16_to_cpu(last->xe_name_offset);
1486
1487 if (o < offs)
1488 last->xe_name_offset = cpu_to_le16(o + size);
1489 last += 1;
1490 }
1491 1762
1492 if (!xi->value) { 1763 if (free_start < size) {
1493 /* Remove the old entry. */ 1764 if (needed_space)
1494 last -= 1; 1765 return -ENOSPC;
1495 memmove(xs->here, xs->here + 1, 1766 } else {
1496 (void *)last - (void *)xs->here); 1767 /*
1497 memset(last, 0, sizeof(struct ocfs2_xattr_entry)); 1768 * First we check if it would fit in the first place.
1498 le16_add_cpu(&xs->header->xh_count, -1); 1769 * Below, we align the free start to a block. This may
1499 } 1770 * slide us below the minimum gap. By checking unaligned
1771 * first, we avoid that error.
1772 */
1773 rc = ocfs2_xa_check_space_helper(needed_space, free_start,
1774 count);
1775 if (rc)
1776 return rc;
1777 free_start = ocfs2_bucket_align_free_start(sb, free_start,
1778 size);
1500 } 1779 }
1501 if (xi->value) { 1780 return ocfs2_xa_check_space_helper(needed_space, free_start, count);
1502 /* Insert the new name+value. */ 1781}
1503 size_t size = OCFS2_XATTR_SIZE(name_len) + 1782
1504 OCFS2_XATTR_SIZE(xi->value_len); 1783static void ocfs2_xa_bucket_wipe_namevalue(struct ocfs2_xa_loc *loc)
1505 void *val = xs->base + min_offs - size; 1784{
1785 le16_add_cpu(&loc->xl_header->xh_name_value_len,
1786 -namevalue_size_xe(loc->xl_entry));
1787}
1506 1788
1507 xs->here->xe_name_offset = cpu_to_le16(min_offs - size); 1789static void ocfs2_xa_bucket_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
1508 memset(val, 0, size); 1790{
1509 memcpy(val, xi->name, name_len); 1791 struct ocfs2_xattr_header *xh = loc->xl_header;
1510 memcpy(val + OCFS2_XATTR_SIZE(name_len), 1792 int count = le16_to_cpu(xh->xh_count);
1511 xi->value, 1793 int low = 0, high = count - 1, tmp;
1512 xi->value_len); 1794 struct ocfs2_xattr_entry *tmp_xe;
1513 xs->here->xe_value_size = cpu_to_le64(xi->value_len); 1795
1514 ocfs2_xattr_set_local(xs->here, 1); 1796 /*
1515 ocfs2_xattr_hash_entry(inode, xs->header, xs->here); 1797 * We keep buckets sorted by name_hash, so we need to find
1798 * our insert place.
1799 */
1800 while (low <= high && count) {
1801 tmp = (low + high) / 2;
1802 tmp_xe = &xh->xh_entries[tmp];
1803
1804 if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
1805 low = tmp + 1;
1806 else if (name_hash < le32_to_cpu(tmp_xe->xe_name_hash))
1807 high = tmp - 1;
1808 else {
1809 low = tmp;
1810 break;
1811 }
1516 } 1812 }
1517 1813
1518 return; 1814 if (low != count)
1815 memmove(&xh->xh_entries[low + 1],
1816 &xh->xh_entries[low],
1817 ((count - low) * sizeof(struct ocfs2_xattr_entry)));
1818
1819 le16_add_cpu(&xh->xh_count, 1);
1820 loc->xl_entry = &xh->xh_entries[low];
1821 memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
1822}
1823
1824static void ocfs2_xa_bucket_add_namevalue(struct ocfs2_xa_loc *loc, int size)
1825{
1826 int free_start = ocfs2_xa_get_free_start(loc);
1827 struct ocfs2_xattr_header *xh = loc->xl_header;
1828 struct super_block *sb = loc->xl_inode->i_sb;
1829 int nameval_offset;
1830
1831 free_start = ocfs2_bucket_align_free_start(sb, free_start, size);
1832 nameval_offset = free_start - size;
1833 loc->xl_entry->xe_name_offset = cpu_to_le16(nameval_offset);
1834 xh->xh_free_start = cpu_to_le16(nameval_offset);
1835 le16_add_cpu(&xh->xh_name_value_len, size);
1836
1837}
1838
1839static void ocfs2_xa_bucket_fill_value_buf(struct ocfs2_xa_loc *loc,
1840 struct ocfs2_xattr_value_buf *vb)
1841{
1842 struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
1843 struct super_block *sb = loc->xl_inode->i_sb;
1844 int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
1845 int size = namevalue_size_xe(loc->xl_entry);
1846 int block_offset = nameval_offset >> sb->s_blocksize_bits;
1847
1848 /* Values are not allowed to straddle block boundaries */
1849 BUG_ON(block_offset !=
1850 ((nameval_offset + size - 1) >> sb->s_blocksize_bits));
1851 /* We expect the bucket to be filled in */
1852 BUG_ON(!bucket->bu_bhs[block_offset]);
1853
1854 vb->vb_access = ocfs2_journal_access;
1855 vb->vb_bh = bucket->bu_bhs[block_offset];
1856}
1857
1858/* Operations for xattrs stored in buckets. */
1859static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = {
1860 .xlo_journal_access = ocfs2_xa_bucket_journal_access,
1861 .xlo_journal_dirty = ocfs2_xa_bucket_journal_dirty,
1862 .xlo_offset_pointer = ocfs2_xa_bucket_offset_pointer,
1863 .xlo_check_space = ocfs2_xa_bucket_check_space,
1864 .xlo_can_reuse = ocfs2_xa_bucket_can_reuse,
1865 .xlo_get_free_start = ocfs2_xa_bucket_get_free_start,
1866 .xlo_wipe_namevalue = ocfs2_xa_bucket_wipe_namevalue,
1867 .xlo_add_entry = ocfs2_xa_bucket_add_entry,
1868 .xlo_add_namevalue = ocfs2_xa_bucket_add_namevalue,
1869 .xlo_fill_value_buf = ocfs2_xa_bucket_fill_value_buf,
1870};
1871
1872static unsigned int ocfs2_xa_value_clusters(struct ocfs2_xa_loc *loc)
1873{
1874 struct ocfs2_xattr_value_buf vb;
1875
1876 if (ocfs2_xattr_is_local(loc->xl_entry))
1877 return 0;
1878
1879 ocfs2_xa_fill_value_buf(loc, &vb);
1880 return le32_to_cpu(vb.vb_xv->xr_clusters);
1881}
1882
1883static int ocfs2_xa_value_truncate(struct ocfs2_xa_loc *loc, u64 bytes,
1884 struct ocfs2_xattr_set_ctxt *ctxt)
1885{
1886 int trunc_rc, access_rc;
1887 struct ocfs2_xattr_value_buf vb;
1888
1889 ocfs2_xa_fill_value_buf(loc, &vb);
1890 trunc_rc = ocfs2_xattr_value_truncate(loc->xl_inode, &vb, bytes,
1891 ctxt);
1892
1893 /*
1894 * The caller of ocfs2_xa_value_truncate() has already called
1895 * ocfs2_xa_journal_access on the loc. However, The truncate code
1896 * calls ocfs2_extend_trans(). This may commit the previous
1897 * transaction and open a new one. If this is a bucket, truncate
1898 * could leave only vb->vb_bh set up for journaling. Meanwhile,
1899 * the caller is expecting to dirty the entire bucket. So we must
1900 * reset the journal work. We do this even if truncate has failed,
1901 * as it could have failed after committing the extend.
1902 */
1903 access_rc = ocfs2_xa_journal_access(ctxt->handle, loc,
1904 OCFS2_JOURNAL_ACCESS_WRITE);
1905
1906 /* Errors in truncate take precedence */
1907 return trunc_rc ? trunc_rc : access_rc;
1908}
1909
1910static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc)
1911{
1912 int index, count;
1913 struct ocfs2_xattr_header *xh = loc->xl_header;
1914 struct ocfs2_xattr_entry *entry = loc->xl_entry;
1915
1916 ocfs2_xa_wipe_namevalue(loc);
1917 loc->xl_entry = NULL;
1918
1919 le16_add_cpu(&xh->xh_count, -1);
1920 count = le16_to_cpu(xh->xh_count);
1921
1922 /*
1923 * Only zero out the entry if there are more remaining. This is
1924 * important for an empty bucket, as it keeps track of the
1925 * bucket's hash value. It doesn't hurt empty block storage.
1926 */
1927 if (count) {
1928 index = ((char *)entry - (char *)&xh->xh_entries) /
1929 sizeof(struct ocfs2_xattr_entry);
1930 memmove(&xh->xh_entries[index], &xh->xh_entries[index + 1],
1931 (count - index) * sizeof(struct ocfs2_xattr_entry));
1932 memset(&xh->xh_entries[count], 0,
1933 sizeof(struct ocfs2_xattr_entry));
1934 }
1519} 1935}
1520 1936
1521/* 1937/*
1522 * ocfs2_xattr_set_entry() 1938 * If we have a problem adjusting the size of an external value during
1939 * ocfs2_xa_prepare_entry() or ocfs2_xa_remove(), we may have an xattr
1940 * in an intermediate state. For example, the value may be partially
1941 * truncated.
1942 *
1943 * If the value tree hasn't changed, the extend/truncate went nowhere.
1944 * We have nothing to do. The caller can treat it as a straight error.
1523 * 1945 *
1524 * Set extended attribute entry into inode or block. 1946 * If the value tree got partially truncated, we now have a corrupted
1947 * extended attribute. We're going to wipe its entry and leak the
1948 * clusters. Better to leak some storage than leave a corrupt entry.
1525 * 1949 *
1526 * If extended attribute value size > OCFS2_XATTR_INLINE_SIZE, 1950 * If the value tree grew, it obviously didn't grow enough for the
1527 * We first insert tree root(ocfs2_xattr_value_root) with set_entry_local(), 1951 * new entry. We're not going to try and reclaim those clusters either.
1528 * then set value in B tree with set_value_outside(). 1952 * If there was already an external value there (orig_clusters != 0),
1953 * the new clusters are attached safely and we can just leave the old
1954 * value in place. If there was no external value there, we remove
1955 * the entry.
1956 *
1957 * This way, the xattr block we store in the journal will be consistent.
1958 * If the size change broke because of the journal, no changes will hit
1959 * disk anyway.
1529 */ 1960 */
1530static int ocfs2_xattr_set_entry(struct inode *inode, 1961static void ocfs2_xa_cleanup_value_truncate(struct ocfs2_xa_loc *loc,
1531 struct ocfs2_xattr_info *xi, 1962 const char *what,
1532 struct ocfs2_xattr_search *xs, 1963 unsigned int orig_clusters)
1533 struct ocfs2_xattr_set_ctxt *ctxt, 1964{
1534 int flag) 1965 unsigned int new_clusters = ocfs2_xa_value_clusters(loc);
1535{ 1966 char *nameval_buf = ocfs2_xa_offset_pointer(loc,
1536 struct ocfs2_xattr_entry *last; 1967 le16_to_cpu(loc->xl_entry->xe_name_offset));
1537 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1968
1538 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 1969 if (new_clusters < orig_clusters) {
1539 size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name); 1970 mlog(ML_ERROR,
1540 size_t size_l = 0; 1971 "Partial truncate while %s xattr %.*s. Leaking "
1541 handle_t *handle = ctxt->handle; 1972 "%u clusters and removing the entry\n",
1542 int free, i, ret; 1973 what, loc->xl_entry->xe_name_len, nameval_buf,
1543 struct ocfs2_xattr_info xi_l = { 1974 orig_clusters - new_clusters);
1544 .name_index = xi->name_index, 1975 ocfs2_xa_remove_entry(loc);
1545 .name = xi->name, 1976 } else if (!orig_clusters) {
1546 .value = xi->value, 1977 mlog(ML_ERROR,
1547 .value_len = xi->value_len, 1978 "Unable to allocate an external value for xattr "
1548 }; 1979 "%.*s safely. Leaking %u clusters and removing the "
1549 struct ocfs2_xattr_value_buf vb = { 1980 "entry\n",
1550 .vb_bh = xs->xattr_bh, 1981 loc->xl_entry->xe_name_len, nameval_buf,
1551 .vb_access = ocfs2_journal_access_di, 1982 new_clusters - orig_clusters);
1552 }; 1983 ocfs2_xa_remove_entry(loc);
1984 } else if (new_clusters > orig_clusters)
1985 mlog(ML_ERROR,
1986 "Unable to grow xattr %.*s safely. %u new clusters "
1987 "have been added, but the value will not be "
1988 "modified\n",
1989 loc->xl_entry->xe_name_len, nameval_buf,
1990 new_clusters - orig_clusters);
1991}
1992
1993static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc,
1994 struct ocfs2_xattr_set_ctxt *ctxt)
1995{
1996 int rc = 0;
1997 unsigned int orig_clusters;
1998
1999 if (!ocfs2_xattr_is_local(loc->xl_entry)) {
2000 orig_clusters = ocfs2_xa_value_clusters(loc);
2001 rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
2002 if (rc) {
2003 mlog_errno(rc);
2004 /*
2005 * Since this is remove, we can return 0 if
2006 * ocfs2_xa_cleanup_value_truncate() is going to
2007 * wipe the entry anyway. So we check the
2008 * cluster count as well.
2009 */
2010 if (orig_clusters != ocfs2_xa_value_clusters(loc))
2011 rc = 0;
2012 ocfs2_xa_cleanup_value_truncate(loc, "removing",
2013 orig_clusters);
2014 if (rc)
2015 goto out;
2016 }
2017 }
1553 2018
1554 if (!(flag & OCFS2_INLINE_XATTR_FL)) { 2019 ocfs2_xa_remove_entry(loc);
1555 BUG_ON(xs->xattr_bh == xs->inode_bh);
1556 vb.vb_access = ocfs2_journal_access_xb;
1557 } else
1558 BUG_ON(xs->xattr_bh != xs->inode_bh);
1559 2020
1560 /* Compute min_offs, last and free space. */ 2021out:
1561 last = xs->header->xh_entries; 2022 return rc;
2023}
1562 2024
1563 for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) { 2025static void ocfs2_xa_install_value_root(struct ocfs2_xa_loc *loc)
1564 size_t offs = le16_to_cpu(last->xe_name_offset); 2026{
1565 if (offs < min_offs) 2027 int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
1566 min_offs = offs; 2028 char *nameval_buf;
1567 last += 1;
1568 }
1569 2029
1570 free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP; 2030 nameval_buf = ocfs2_xa_offset_pointer(loc,
1571 if (free < 0) 2031 le16_to_cpu(loc->xl_entry->xe_name_offset));
1572 return -EIO; 2032 memcpy(nameval_buf + name_size, &def_xv, OCFS2_XATTR_ROOT_SIZE);
2033}
1573 2034
1574 if (!xs->not_found) { 2035/*
1575 size_t size = 0; 2036 * Take an existing entry and make it ready for the new value. This
1576 if (ocfs2_xattr_is_local(xs->here)) 2037 * won't allocate space, but it may free space. It should be ready for
1577 size = OCFS2_XATTR_SIZE(name_len) + 2038 * ocfs2_xa_prepare_entry() to finish the work.
1578 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); 2039 */
1579 else 2040static int ocfs2_xa_reuse_entry(struct ocfs2_xa_loc *loc,
1580 size = OCFS2_XATTR_SIZE(name_len) + 2041 struct ocfs2_xattr_info *xi,
1581 OCFS2_XATTR_ROOT_SIZE; 2042 struct ocfs2_xattr_set_ctxt *ctxt)
1582 free += (size + sizeof(struct ocfs2_xattr_entry)); 2043{
1583 } 2044 int rc = 0;
1584 /* Check free space in inode or block */ 2045 int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
1585 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 2046 unsigned int orig_clusters;
1586 if (free < sizeof(struct ocfs2_xattr_entry) + 2047 char *nameval_buf;
1587 OCFS2_XATTR_SIZE(name_len) + 2048 int xe_local = ocfs2_xattr_is_local(loc->xl_entry);
1588 OCFS2_XATTR_ROOT_SIZE) { 2049 int xi_local = xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE;
1589 ret = -ENOSPC; 2050
1590 goto out; 2051 BUG_ON(OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len) !=
2052 name_size);
2053
2054 nameval_buf = ocfs2_xa_offset_pointer(loc,
2055 le16_to_cpu(loc->xl_entry->xe_name_offset));
2056 if (xe_local) {
2057 memset(nameval_buf + name_size, 0,
2058 namevalue_size_xe(loc->xl_entry) - name_size);
2059 if (!xi_local)
2060 ocfs2_xa_install_value_root(loc);
2061 } else {
2062 orig_clusters = ocfs2_xa_value_clusters(loc);
2063 if (xi_local) {
2064 rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
2065 if (rc < 0)
2066 mlog_errno(rc);
2067 else
2068 memset(nameval_buf + name_size, 0,
2069 namevalue_size_xe(loc->xl_entry) -
2070 name_size);
2071 } else if (le64_to_cpu(loc->xl_entry->xe_value_size) >
2072 xi->xi_value_len) {
2073 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len,
2074 ctxt);
2075 if (rc < 0)
2076 mlog_errno(rc);
1591 } 2077 }
1592 size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; 2078
1593 xi_l.value = (void *)&def_xv; 2079 if (rc) {
1594 xi_l.value_len = OCFS2_XATTR_ROOT_SIZE; 2080 ocfs2_xa_cleanup_value_truncate(loc, "reusing",
1595 } else if (xi->value) { 2081 orig_clusters);
1596 if (free < sizeof(struct ocfs2_xattr_entry) +
1597 OCFS2_XATTR_SIZE(name_len) +
1598 OCFS2_XATTR_SIZE(xi->value_len)) {
1599 ret = -ENOSPC;
1600 goto out; 2082 goto out;
1601 } 2083 }
1602 } 2084 }
1603 2085
1604 if (!xs->not_found) { 2086 loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
1605 /* For existing extended attribute */ 2087 ocfs2_xattr_set_local(loc->xl_entry, xi_local);
1606 size_t size = OCFS2_XATTR_SIZE(name_len) +
1607 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
1608 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1609 void *val = xs->base + offs;
1610 2088
1611 if (ocfs2_xattr_is_local(xs->here) && size == size_l) { 2089out:
1612 /* Replace existing local xattr with tree root */ 2090 return rc;
1613 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, 2091}
1614 ctxt, &vb, offs);
1615 if (ret < 0)
1616 mlog_errno(ret);
1617 goto out;
1618 } else if (!ocfs2_xattr_is_local(xs->here)) {
1619 /* For existing xattr which has value outside */
1620 vb.vb_xv = (struct ocfs2_xattr_value_root *)
1621 (val + OCFS2_XATTR_SIZE(name_len));
1622 2092
1623 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 2093/*
1624 /* 2094 * Prepares loc->xl_entry to receive the new xattr. This includes
1625 * If new value need set outside also, 2095 * properly setting up the name+value pair region. If loc->xl_entry
1626 * first truncate old value to new value, 2096 * already exists, it will take care of modifying it appropriately.
1627 * then set new value with set_value_outside(). 2097 *
1628 */ 2098 * Note that this modifies the data. You did journal_access already,
1629 ret = ocfs2_xattr_value_truncate(inode, 2099 * right?
1630 &vb, 2100 */
1631 xi->value_len, 2101static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
1632 ctxt); 2102 struct ocfs2_xattr_info *xi,
1633 if (ret < 0) { 2103 u32 name_hash,
1634 mlog_errno(ret); 2104 struct ocfs2_xattr_set_ctxt *ctxt)
1635 goto out; 2105{
1636 } 2106 int rc = 0;
2107 unsigned int orig_clusters;
2108 __le64 orig_value_size = 0;
1637 2109
1638 ret = ocfs2_xattr_update_entry(inode, 2110 rc = ocfs2_xa_check_space(loc, xi);
1639 handle, 2111 if (rc)
1640 xi, 2112 goto out;
1641 xs,
1642 &vb,
1643 offs);
1644 if (ret < 0) {
1645 mlog_errno(ret);
1646 goto out;
1647 }
1648 2113
1649 ret = __ocfs2_xattr_set_value_outside(inode, 2114 if (loc->xl_entry) {
1650 handle, 2115 if (ocfs2_xa_can_reuse_entry(loc, xi)) {
1651 &vb, 2116 orig_value_size = loc->xl_entry->xe_value_size;
1652 xi->value, 2117 rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
1653 xi->value_len); 2118 if (rc)
1654 if (ret < 0) 2119 goto out;
1655 mlog_errno(ret); 2120 goto alloc_value;
2121 }
2122
2123 if (!ocfs2_xattr_is_local(loc->xl_entry)) {
2124 orig_clusters = ocfs2_xa_value_clusters(loc);
2125 rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
2126 if (rc) {
2127 mlog_errno(rc);
2128 ocfs2_xa_cleanup_value_truncate(loc,
2129 "overwriting",
2130 orig_clusters);
1656 goto out; 2131 goto out;
1657 } else {
1658 /*
1659 * If new value need set in local,
1660 * just trucate old value to zero.
1661 */
1662 ret = ocfs2_xattr_value_truncate(inode,
1663 &vb,
1664 0,
1665 ctxt);
1666 if (ret < 0)
1667 mlog_errno(ret);
1668 } 2132 }
1669 } 2133 }
2134 ocfs2_xa_wipe_namevalue(loc);
2135 } else
2136 ocfs2_xa_add_entry(loc, name_hash);
2137
2138 /*
2139 * If we get here, we have a blank entry. Fill it. We grow our
2140 * name+value pair back from the end.
2141 */
2142 ocfs2_xa_add_namevalue(loc, xi);
2143 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
2144 ocfs2_xa_install_value_root(loc);
2145
2146alloc_value:
2147 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2148 orig_clusters = ocfs2_xa_value_clusters(loc);
2149 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
2150 if (rc < 0) {
2151 /*
2152 * If we tried to grow an existing external value,
2153 * ocfs2_xa_cleanuP-value_truncate() is going to
2154 * let it stand. We have to restore its original
2155 * value size.
2156 */
2157 loc->xl_entry->xe_value_size = orig_value_size;
2158 ocfs2_xa_cleanup_value_truncate(loc, "growing",
2159 orig_clusters);
2160 mlog_errno(rc);
2161 }
1670 } 2162 }
1671 2163
1672 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), xs->inode_bh, 2164out:
2165 return rc;
2166}
2167
2168/*
2169 * Store the value portion of the name+value pair. This will skip
2170 * values that are stored externally. Their tree roots were set up
2171 * by ocfs2_xa_prepare_entry().
2172 */
2173static int ocfs2_xa_store_value(struct ocfs2_xa_loc *loc,
2174 struct ocfs2_xattr_info *xi,
2175 struct ocfs2_xattr_set_ctxt *ctxt)
2176{
2177 int rc = 0;
2178 int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
2179 int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
2180 char *nameval_buf;
2181 struct ocfs2_xattr_value_buf vb;
2182
2183 nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
2184 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2185 ocfs2_xa_fill_value_buf(loc, &vb);
2186 rc = __ocfs2_xattr_set_value_outside(loc->xl_inode,
2187 ctxt->handle, &vb,
2188 xi->xi_value,
2189 xi->xi_value_len);
2190 } else
2191 memcpy(nameval_buf + name_size, xi->xi_value, xi->xi_value_len);
2192
2193 return rc;
2194}
2195
2196static int ocfs2_xa_set(struct ocfs2_xa_loc *loc,
2197 struct ocfs2_xattr_info *xi,
2198 struct ocfs2_xattr_set_ctxt *ctxt)
2199{
2200 int ret;
2201 u32 name_hash = ocfs2_xattr_name_hash(loc->xl_inode, xi->xi_name,
2202 xi->xi_name_len);
2203
2204 ret = ocfs2_xa_journal_access(ctxt->handle, loc,
1673 OCFS2_JOURNAL_ACCESS_WRITE); 2205 OCFS2_JOURNAL_ACCESS_WRITE);
1674 if (ret) { 2206 if (ret) {
1675 mlog_errno(ret); 2207 mlog_errno(ret);
1676 goto out; 2208 goto out;
1677 } 2209 }
1678 2210
1679 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1680 ret = vb.vb_access(handle, INODE_CACHE(inode), vb.vb_bh,
1681 OCFS2_JOURNAL_ACCESS_WRITE);
1682 if (ret) {
1683 mlog_errno(ret);
1684 goto out;
1685 }
1686 }
1687
1688 /* 2211 /*
1689 * Set value in local, include set tree root in local. 2212 * From here on out, everything is going to modify the buffer a
1690 * This is the first step for value size >INLINE_SIZE. 2213 * little. Errors are going to leave the xattr header in a
2214 * sane state. Thus, even with errors we dirty the sucker.
1691 */ 2215 */
1692 ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs);
1693 2216
1694 if (!(flag & OCFS2_INLINE_XATTR_FL)) { 2217 /* Don't worry, we are never called with !xi_value and !xl_entry */
1695 ret = ocfs2_journal_dirty(handle, xs->xattr_bh); 2218 if (!xi->xi_value) {
1696 if (ret < 0) { 2219 ret = ocfs2_xa_remove(loc, ctxt);
1697 mlog_errno(ret); 2220 goto out_dirty;
1698 goto out;
1699 }
1700 } 2221 }
1701 2222
1702 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) && 2223 ret = ocfs2_xa_prepare_entry(loc, xi, name_hash, ctxt);
1703 (flag & OCFS2_INLINE_XATTR_FL)) { 2224 if (ret) {
1704 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2225 if (ret != -ENOSPC)
1705 unsigned int xattrsize = osb->s_xattr_inline_size; 2226 mlog_errno(ret);
1706 2227 goto out_dirty;
1707 /*
1708 * Adjust extent record count or inline data size
1709 * to reserve space for extended attribute.
1710 */
1711 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1712 struct ocfs2_inline_data *idata = &di->id2.i_data;
1713 le16_add_cpu(&idata->id_count, -xattrsize);
1714 } else if (!(ocfs2_inode_is_fast_symlink(inode))) {
1715 struct ocfs2_extent_list *el = &di->id2.i_list;
1716 le16_add_cpu(&el->l_count, -(xattrsize /
1717 sizeof(struct ocfs2_extent_rec)));
1718 }
1719 di->i_xattr_inline_size = cpu_to_le16(xattrsize);
1720 } 2228 }
1721 /* Update xattr flag */
1722 spin_lock(&oi->ip_lock);
1723 oi->ip_dyn_features |= flag;
1724 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
1725 spin_unlock(&oi->ip_lock);
1726 2229
1727 ret = ocfs2_journal_dirty(handle, xs->inode_bh); 2230 ret = ocfs2_xa_store_value(loc, xi, ctxt);
1728 if (ret < 0) 2231 if (ret)
1729 mlog_errno(ret); 2232 mlog_errno(ret);
1730 2233
1731 if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 2234out_dirty:
1732 /* 2235 ocfs2_xa_journal_dirty(ctxt->handle, loc);
1733 * Set value outside in B tree.
1734 * This is the second step for value size > INLINE_SIZE.
1735 */
1736 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1737 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt,
1738 &vb, offs);
1739 if (ret < 0) {
1740 int ret2;
1741 2236
1742 mlog_errno(ret);
1743 /*
1744 * If set value outside failed, we have to clean
1745 * the junk tree root we have already set in local.
1746 */
1747 ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle,
1748 xi, xs, &vb, offs);
1749 if (ret2 < 0)
1750 mlog_errno(ret2);
1751 }
1752 }
1753out: 2237out:
1754 return ret; 2238 return ret;
1755} 2239}
1756 2240
2241static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc,
2242 struct inode *inode,
2243 struct buffer_head *bh,
2244 struct ocfs2_xattr_entry *entry)
2245{
2246 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
2247
2248 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_XATTR_FL));
2249
2250 loc->xl_inode = inode;
2251 loc->xl_ops = &ocfs2_xa_block_loc_ops;
2252 loc->xl_storage = bh;
2253 loc->xl_entry = entry;
2254 loc->xl_size = le16_to_cpu(di->i_xattr_inline_size);
2255 loc->xl_header =
2256 (struct ocfs2_xattr_header *)(bh->b_data + bh->b_size -
2257 loc->xl_size);
2258}
2259
2260static void ocfs2_init_xattr_block_xa_loc(struct ocfs2_xa_loc *loc,
2261 struct inode *inode,
2262 struct buffer_head *bh,
2263 struct ocfs2_xattr_entry *entry)
2264{
2265 struct ocfs2_xattr_block *xb =
2266 (struct ocfs2_xattr_block *)bh->b_data;
2267
2268 BUG_ON(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED);
2269
2270 loc->xl_inode = inode;
2271 loc->xl_ops = &ocfs2_xa_block_loc_ops;
2272 loc->xl_storage = bh;
2273 loc->xl_header = &(xb->xb_attrs.xb_header);
2274 loc->xl_entry = entry;
2275 loc->xl_size = bh->b_size - offsetof(struct ocfs2_xattr_block,
2276 xb_attrs.xb_header);
2277}
2278
2279static void ocfs2_init_xattr_bucket_xa_loc(struct ocfs2_xa_loc *loc,
2280 struct ocfs2_xattr_bucket *bucket,
2281 struct ocfs2_xattr_entry *entry)
2282{
2283 loc->xl_inode = bucket->bu_inode;
2284 loc->xl_ops = &ocfs2_xa_bucket_loc_ops;
2285 loc->xl_storage = bucket;
2286 loc->xl_header = bucket_xh(bucket);
2287 loc->xl_entry = entry;
2288 loc->xl_size = OCFS2_XATTR_BUCKET_SIZE;
2289}
2290
1757/* 2291/*
1758 * In xattr remove, if it is stored outside and refcounted, we may have 2292 * In xattr remove, if it is stored outside and refcounted, we may have
1759 * the chance to split the refcount tree. So need the allocators. 2293 * the chance to split the refcount tree. So need the allocators.
@@ -2149,6 +2683,55 @@ static int ocfs2_xattr_ibody_find(struct inode *inode,
2149 return 0; 2683 return 0;
2150} 2684}
2151 2685
2686static int ocfs2_xattr_ibody_init(struct inode *inode,
2687 struct buffer_head *di_bh,
2688 struct ocfs2_xattr_set_ctxt *ctxt)
2689{
2690 int ret;
2691 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2692 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2693 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2694 unsigned int xattrsize = osb->s_xattr_inline_size;
2695
2696 if (!ocfs2_xattr_has_space_inline(inode, di)) {
2697 ret = -ENOSPC;
2698 goto out;
2699 }
2700
2701 ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), di_bh,
2702 OCFS2_JOURNAL_ACCESS_WRITE);
2703 if (ret) {
2704 mlog_errno(ret);
2705 goto out;
2706 }
2707
2708 /*
2709 * Adjust extent record count or inline data size
2710 * to reserve space for extended attribute.
2711 */
2712 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
2713 struct ocfs2_inline_data *idata = &di->id2.i_data;
2714 le16_add_cpu(&idata->id_count, -xattrsize);
2715 } else if (!(ocfs2_inode_is_fast_symlink(inode))) {
2716 struct ocfs2_extent_list *el = &di->id2.i_list;
2717 le16_add_cpu(&el->l_count, -(xattrsize /
2718 sizeof(struct ocfs2_extent_rec)));
2719 }
2720 di->i_xattr_inline_size = cpu_to_le16(xattrsize);
2721
2722 spin_lock(&oi->ip_lock);
2723 oi->ip_dyn_features |= OCFS2_INLINE_XATTR_FL|OCFS2_HAS_XATTR_FL;
2724 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2725 spin_unlock(&oi->ip_lock);
2726
2727 ret = ocfs2_journal_dirty(ctxt->handle, di_bh);
2728 if (ret < 0)
2729 mlog_errno(ret);
2730
2731out:
2732 return ret;
2733}
2734
2152/* 2735/*
2153 * ocfs2_xattr_ibody_set() 2736 * ocfs2_xattr_ibody_set()
2154 * 2737 *
@@ -2160,9 +2743,10 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
2160 struct ocfs2_xattr_search *xs, 2743 struct ocfs2_xattr_search *xs,
2161 struct ocfs2_xattr_set_ctxt *ctxt) 2744 struct ocfs2_xattr_set_ctxt *ctxt)
2162{ 2745{
2746 int ret;
2163 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2747 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2164 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 2748 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
2165 int ret; 2749 struct ocfs2_xa_loc loc;
2166 2750
2167 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) 2751 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
2168 return -ENOSPC; 2752 return -ENOSPC;
@@ -2175,8 +2759,25 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
2175 } 2759 }
2176 } 2760 }
2177 2761
2178 ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt, 2762 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
2179 (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL)); 2763 ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt);
2764 if (ret) {
2765 if (ret != -ENOSPC)
2766 mlog_errno(ret);
2767 goto out;
2768 }
2769 }
2770
2771 ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh,
2772 xs->not_found ? NULL : xs->here);
2773 ret = ocfs2_xa_set(&loc, xi, ctxt);
2774 if (ret) {
2775 if (ret != -ENOSPC)
2776 mlog_errno(ret);
2777 goto out;
2778 }
2779 xs->here = loc.xl_entry;
2780
2180out: 2781out:
2181 up_write(&oi->ip_alloc_sem); 2782 up_write(&oi->ip_alloc_sem);
2182 2783
@@ -2236,12 +2837,11 @@ cleanup:
2236 return ret; 2837 return ret;
2237} 2838}
2238 2839
2239static int ocfs2_create_xattr_block(handle_t *handle, 2840static int ocfs2_create_xattr_block(struct inode *inode,
2240 struct inode *inode,
2241 struct buffer_head *inode_bh, 2841 struct buffer_head *inode_bh,
2242 struct ocfs2_alloc_context *meta_ac, 2842 struct ocfs2_xattr_set_ctxt *ctxt,
2243 struct buffer_head **ret_bh, 2843 int indexed,
2244 int indexed) 2844 struct buffer_head **ret_bh)
2245{ 2845{
2246 int ret; 2846 int ret;
2247 u16 suballoc_bit_start; 2847 u16 suballoc_bit_start;
@@ -2252,14 +2852,14 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2252 struct buffer_head *new_bh = NULL; 2852 struct buffer_head *new_bh = NULL;
2253 struct ocfs2_xattr_block *xblk; 2853 struct ocfs2_xattr_block *xblk;
2254 2854
2255 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), inode_bh, 2855 ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode),
2256 OCFS2_JOURNAL_ACCESS_CREATE); 2856 inode_bh, OCFS2_JOURNAL_ACCESS_CREATE);
2257 if (ret < 0) { 2857 if (ret < 0) {
2258 mlog_errno(ret); 2858 mlog_errno(ret);
2259 goto end; 2859 goto end;
2260 } 2860 }
2261 2861
2262 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, 2862 ret = ocfs2_claim_metadata(osb, ctxt->handle, ctxt->meta_ac, 1,
2263 &suballoc_bit_start, &num_got, 2863 &suballoc_bit_start, &num_got,
2264 &first_blkno); 2864 &first_blkno);
2265 if (ret < 0) { 2865 if (ret < 0) {
@@ -2270,7 +2870,7 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2270 new_bh = sb_getblk(inode->i_sb, first_blkno); 2870 new_bh = sb_getblk(inode->i_sb, first_blkno);
2271 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); 2871 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
2272 2872
2273 ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), 2873 ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode),
2274 new_bh, 2874 new_bh,
2275 OCFS2_JOURNAL_ACCESS_CREATE); 2875 OCFS2_JOURNAL_ACCESS_CREATE);
2276 if (ret < 0) { 2876 if (ret < 0) {
@@ -2282,11 +2882,10 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2282 xblk = (struct ocfs2_xattr_block *)new_bh->b_data; 2882 xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
2283 memset(xblk, 0, inode->i_sb->s_blocksize); 2883 memset(xblk, 0, inode->i_sb->s_blocksize);
2284 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); 2884 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
2285 xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num); 2885 xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
2286 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); 2886 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
2287 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); 2887 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
2288 xblk->xb_blkno = cpu_to_le64(first_blkno); 2888 xblk->xb_blkno = cpu_to_le64(first_blkno);
2289
2290 if (indexed) { 2889 if (indexed) {
2291 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root; 2890 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
2292 xr->xt_clusters = cpu_to_le32(1); 2891 xr->xt_clusters = cpu_to_le32(1);
@@ -2297,14 +2896,17 @@ static int ocfs2_create_xattr_block(handle_t *handle,
2297 xr->xt_list.l_next_free_rec = cpu_to_le16(1); 2896 xr->xt_list.l_next_free_rec = cpu_to_le16(1);
2298 xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED); 2897 xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED);
2299 } 2898 }
2899 ocfs2_journal_dirty(ctxt->handle, new_bh);
2300 2900
2301 ret = ocfs2_journal_dirty(handle, new_bh); 2901 /* Add it to the inode */
2302 if (ret < 0) {
2303 mlog_errno(ret);
2304 goto end;
2305 }
2306 di->i_xattr_loc = cpu_to_le64(first_blkno); 2902 di->i_xattr_loc = cpu_to_le64(first_blkno);
2307 ocfs2_journal_dirty(handle, inode_bh); 2903
2904 spin_lock(&OCFS2_I(inode)->ip_lock);
2905 OCFS2_I(inode)->ip_dyn_features |= OCFS2_HAS_XATTR_FL;
2906 di->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features);
2907 spin_unlock(&OCFS2_I(inode)->ip_lock);
2908
2909 ocfs2_journal_dirty(ctxt->handle, inode_bh);
2308 2910
2309 *ret_bh = new_bh; 2911 *ret_bh = new_bh;
2310 new_bh = NULL; 2912 new_bh = NULL;
@@ -2326,13 +2928,13 @@ static int ocfs2_xattr_block_set(struct inode *inode,
2326 struct ocfs2_xattr_set_ctxt *ctxt) 2928 struct ocfs2_xattr_set_ctxt *ctxt)
2327{ 2929{
2328 struct buffer_head *new_bh = NULL; 2930 struct buffer_head *new_bh = NULL;
2329 handle_t *handle = ctxt->handle;
2330 struct ocfs2_xattr_block *xblk = NULL; 2931 struct ocfs2_xattr_block *xblk = NULL;
2331 int ret; 2932 int ret;
2933 struct ocfs2_xa_loc loc;
2332 2934
2333 if (!xs->xattr_bh) { 2935 if (!xs->xattr_bh) {
2334 ret = ocfs2_create_xattr_block(handle, inode, xs->inode_bh, 2936 ret = ocfs2_create_xattr_block(inode, xs->inode_bh, ctxt,
2335 ctxt->meta_ac, &new_bh, 0); 2937 0, &new_bh);
2336 if (ret) { 2938 if (ret) {
2337 mlog_errno(ret); 2939 mlog_errno(ret);
2338 goto end; 2940 goto end;
@@ -2348,21 +2950,25 @@ static int ocfs2_xattr_block_set(struct inode *inode,
2348 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; 2950 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
2349 2951
2350 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) { 2952 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
2351 /* Set extended attribute into external block */ 2953 ocfs2_init_xattr_block_xa_loc(&loc, inode, xs->xattr_bh,
2352 ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt, 2954 xs->not_found ? NULL : xs->here);
2353 OCFS2_HAS_XATTR_FL);
2354 if (!ret || ret != -ENOSPC)
2355 goto end;
2356 2955
2357 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt); 2956 ret = ocfs2_xa_set(&loc, xi, ctxt);
2358 if (ret) 2957 if (!ret)
2958 xs->here = loc.xl_entry;
2959 else if (ret != -ENOSPC)
2359 goto end; 2960 goto end;
2961 else {
2962 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
2963 if (ret)
2964 goto end;
2965 }
2360 } 2966 }
2361 2967
2362 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt); 2968 if (le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)
2969 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
2363 2970
2364end: 2971end:
2365
2366 return ret; 2972 return ret;
2367} 2973}
2368 2974
@@ -2371,7 +2977,6 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
2371 struct ocfs2_xattr_info *xi, 2977 struct ocfs2_xattr_info *xi,
2372 struct ocfs2_xattr_search *xs) 2978 struct ocfs2_xattr_search *xs)
2373{ 2979{
2374 u64 value_size;
2375 struct ocfs2_xattr_entry *last; 2980 struct ocfs2_xattr_entry *last;
2376 int free, i; 2981 int free, i;
2377 size_t min_offs = xs->end - xs->base; 2982 size_t min_offs = xs->end - xs->base;
@@ -2394,13 +2999,7 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
2394 2999
2395 BUG_ON(!xs->not_found); 3000 BUG_ON(!xs->not_found);
2396 3001
2397 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) 3002 if (free >= (sizeof(struct ocfs2_xattr_entry) + namevalue_size_xi(xi)))
2398 value_size = OCFS2_XATTR_ROOT_SIZE;
2399 else
2400 value_size = OCFS2_XATTR_SIZE(xi->value_len);
2401
2402 if (free >= sizeof(struct ocfs2_xattr_entry) +
2403 OCFS2_XATTR_SIZE(strlen(xi->name)) + value_size)
2404 return 1; 3003 return 1;
2405 3004
2406 return 0; 3005 return 0;
@@ -2424,7 +3023,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2424 char *base = NULL; 3023 char *base = NULL;
2425 int name_offset, name_len = 0; 3024 int name_offset, name_len = 0;
2426 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, 3025 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
2427 xi->value_len); 3026 xi->xi_value_len);
2428 u64 value_size; 3027 u64 value_size;
2429 3028
2430 /* 3029 /*
@@ -2432,14 +3031,14 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2432 * No matter whether we replace an old one or add a new one, 3031 * No matter whether we replace an old one or add a new one,
2433 * we need this for writing. 3032 * we need this for writing.
2434 */ 3033 */
2435 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) 3034 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
2436 credits += new_clusters * 3035 credits += new_clusters *
2437 ocfs2_clusters_to_blocks(inode->i_sb, 1); 3036 ocfs2_clusters_to_blocks(inode->i_sb, 1);
2438 3037
2439 if (xis->not_found && xbs->not_found) { 3038 if (xis->not_found && xbs->not_found) {
2440 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); 3039 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2441 3040
2442 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 3041 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2443 clusters_add += new_clusters; 3042 clusters_add += new_clusters;
2444 credits += ocfs2_calc_extend_credits(inode->i_sb, 3043 credits += ocfs2_calc_extend_credits(inode->i_sb,
2445 &def_xv.xv.xr_list, 3044 &def_xv.xv.xr_list,
@@ -2484,7 +3083,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2484 * The credits for removing the value tree will be extended 3083 * The credits for removing the value tree will be extended
2485 * by ocfs2_remove_extent itself. 3084 * by ocfs2_remove_extent itself.
2486 */ 3085 */
2487 if (!xi->value) { 3086 if (!xi->xi_value) {
2488 if (!ocfs2_xattr_is_local(xe)) 3087 if (!ocfs2_xattr_is_local(xe))
2489 credits += ocfs2_remove_extent_credits(inode->i_sb); 3088 credits += ocfs2_remove_extent_credits(inode->i_sb);
2490 3089
@@ -2514,7 +3113,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2514 } 3113 }
2515 } 3114 }
2516 3115
2517 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 3116 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
2518 /* the new values will be stored outside. */ 3117 /* the new values will be stored outside. */
2519 u32 old_clusters = 0; 3118 u32 old_clusters = 0;
2520 3119
@@ -2547,9 +3146,10 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2547 * value, we don't need any allocation, otherwise we have 3146 * value, we don't need any allocation, otherwise we have
2548 * to guess metadata allocation. 3147 * to guess metadata allocation.
2549 */ 3148 */
2550 if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) || 3149 if ((ocfs2_xattr_is_local(xe) &&
3150 (value_size >= xi->xi_value_len)) ||
2551 (!ocfs2_xattr_is_local(xe) && 3151 (!ocfs2_xattr_is_local(xe) &&
2552 OCFS2_XATTR_ROOT_SIZE >= xi->value_len)) 3152 OCFS2_XATTR_ROOT_SIZE >= xi->xi_value_len))
2553 goto out; 3153 goto out;
2554 } 3154 }
2555 3155
@@ -2639,7 +3239,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
2639 3239
2640 meta_add += extra_meta; 3240 meta_add += extra_meta;
2641 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, " 3241 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
2642 "credits = %d\n", xi->name, meta_add, clusters_add, *credits); 3242 "credits = %d\n", xi->xi_name, meta_add, clusters_add, *credits);
2643 3243
2644 if (meta_add) { 3244 if (meta_add) {
2645 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, 3245 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
@@ -2679,7 +3279,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2679{ 3279{
2680 int ret = 0, credits, old_found; 3280 int ret = 0, credits, old_found;
2681 3281
2682 if (!xi->value) { 3282 if (!xi->xi_value) {
2683 /* Remove existing extended attribute */ 3283 /* Remove existing extended attribute */
2684 if (!xis->not_found) 3284 if (!xis->not_found)
2685 ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt); 3285 ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
@@ -2693,8 +3293,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2693 * If succeed and that extended attribute existing in 3293 * If succeed and that extended attribute existing in
2694 * external block, then we will remove it. 3294 * external block, then we will remove it.
2695 */ 3295 */
2696 xi->value = NULL; 3296 xi->xi_value = NULL;
2697 xi->value_len = 0; 3297 xi->xi_value_len = 0;
2698 3298
2699 old_found = xis->not_found; 3299 old_found = xis->not_found;
2700 xis->not_found = -ENODATA; 3300 xis->not_found = -ENODATA;
@@ -2722,8 +3322,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2722 } else if (ret == -ENOSPC) { 3322 } else if (ret == -ENOSPC) {
2723 if (di->i_xattr_loc && !xbs->xattr_bh) { 3323 if (di->i_xattr_loc && !xbs->xattr_bh) {
2724 ret = ocfs2_xattr_block_find(inode, 3324 ret = ocfs2_xattr_block_find(inode,
2725 xi->name_index, 3325 xi->xi_name_index,
2726 xi->name, xbs); 3326 xi->xi_name, xbs);
2727 if (ret) 3327 if (ret)
2728 goto out; 3328 goto out;
2729 3329
@@ -2762,8 +3362,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2762 * If succeed and that extended attribute 3362 * If succeed and that extended attribute
2763 * existing in inode, we will remove it. 3363 * existing in inode, we will remove it.
2764 */ 3364 */
2765 xi->value = NULL; 3365 xi->xi_value = NULL;
2766 xi->value_len = 0; 3366 xi->xi_value_len = 0;
2767 xbs->not_found = -ENODATA; 3367 xbs->not_found = -ENODATA;
2768 ret = ocfs2_calc_xattr_set_need(inode, 3368 ret = ocfs2_calc_xattr_set_need(inode,
2769 di, 3369 di,
@@ -2829,10 +3429,11 @@ int ocfs2_xattr_set_handle(handle_t *handle,
2829 int ret; 3429 int ret;
2830 3430
2831 struct ocfs2_xattr_info xi = { 3431 struct ocfs2_xattr_info xi = {
2832 .name_index = name_index, 3432 .xi_name_index = name_index,
2833 .name = name, 3433 .xi_name = name,
2834 .value = value, 3434 .xi_name_len = strlen(name),
2835 .value_len = value_len, 3435 .xi_value = value,
3436 .xi_value_len = value_len,
2836 }; 3437 };
2837 3438
2838 struct ocfs2_xattr_search xis = { 3439 struct ocfs2_xattr_search xis = {
@@ -2912,10 +3513,11 @@ int ocfs2_xattr_set(struct inode *inode,
2912 struct ocfs2_refcount_tree *ref_tree = NULL; 3513 struct ocfs2_refcount_tree *ref_tree = NULL;
2913 3514
2914 struct ocfs2_xattr_info xi = { 3515 struct ocfs2_xattr_info xi = {
2915 .name_index = name_index, 3516 .xi_name_index = name_index,
2916 .name = name, 3517 .xi_name = name,
2917 .value = value, 3518 .xi_name_len = strlen(name),
2918 .value_len = value_len, 3519 .xi_value = value,
3520 .xi_value_len = value_len,
2919 }; 3521 };
2920 3522
2921 struct ocfs2_xattr_search xis = { 3523 struct ocfs2_xattr_search xis = {
@@ -3759,7 +4361,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
3759 struct ocfs2_xattr_bucket *bucket) 4361 struct ocfs2_xattr_bucket *bucket)
3760{ 4362{
3761 int ret, i; 4363 int ret, i;
3762 size_t end, offset, len, value_len; 4364 size_t end, offset, len;
3763 struct ocfs2_xattr_header *xh; 4365 struct ocfs2_xattr_header *xh;
3764 char *entries, *buf, *bucket_buf = NULL; 4366 char *entries, *buf, *bucket_buf = NULL;
3765 u64 blkno = bucket_blkno(bucket); 4367 u64 blkno = bucket_blkno(bucket);
@@ -3813,12 +4415,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
3813 end = OCFS2_XATTR_BUCKET_SIZE; 4415 end = OCFS2_XATTR_BUCKET_SIZE;
3814 for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) { 4416 for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) {
3815 offset = le16_to_cpu(xe->xe_name_offset); 4417 offset = le16_to_cpu(xe->xe_name_offset);
3816 if (ocfs2_xattr_is_local(xe)) 4418 len = namevalue_size_xe(xe);
3817 value_len = OCFS2_XATTR_SIZE(
3818 le64_to_cpu(xe->xe_value_size));
3819 else
3820 value_len = OCFS2_XATTR_ROOT_SIZE;
3821 len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len;
3822 4419
3823 /* 4420 /*
3824 * We must make sure that the name/value pair 4421 * We must make sure that the name/value pair
@@ -4007,7 +4604,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4007 int new_bucket_head) 4604 int new_bucket_head)
4008{ 4605{
4009 int ret, i; 4606 int ret, i;
4010 int count, start, len, name_value_len = 0, xe_len, name_offset = 0; 4607 int count, start, len, name_value_len = 0, name_offset = 0;
4011 struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL; 4608 struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
4012 struct ocfs2_xattr_header *xh; 4609 struct ocfs2_xattr_header *xh;
4013 struct ocfs2_xattr_entry *xe; 4610 struct ocfs2_xattr_entry *xe;
@@ -4098,13 +4695,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4098 name_value_len = 0; 4695 name_value_len = 0;
4099 for (i = 0; i < start; i++) { 4696 for (i = 0; i < start; i++) {
4100 xe = &xh->xh_entries[i]; 4697 xe = &xh->xh_entries[i];
4101 xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len); 4698 name_value_len += namevalue_size_xe(xe);
4102 if (ocfs2_xattr_is_local(xe))
4103 xe_len +=
4104 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4105 else
4106 xe_len += OCFS2_XATTR_ROOT_SIZE;
4107 name_value_len += xe_len;
4108 if (le16_to_cpu(xe->xe_name_offset) < name_offset) 4699 if (le16_to_cpu(xe->xe_name_offset) < name_offset)
4109 name_offset = le16_to_cpu(xe->xe_name_offset); 4700 name_offset = le16_to_cpu(xe->xe_name_offset);
4110 } 4701 }
@@ -4134,12 +4725,6 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4134 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE); 4725 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
4135 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { 4726 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
4136 xe = &xh->xh_entries[i]; 4727 xe = &xh->xh_entries[i];
4137 xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
4138 if (ocfs2_xattr_is_local(xe))
4139 xe_len +=
4140 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4141 else
4142 xe_len += OCFS2_XATTR_ROOT_SIZE;
4143 if (le16_to_cpu(xe->xe_name_offset) < 4728 if (le16_to_cpu(xe->xe_name_offset) <
4144 le16_to_cpu(xh->xh_free_start)) 4729 le16_to_cpu(xh->xh_free_start))
4145 xh->xh_free_start = xe->xe_name_offset; 4730 xh->xh_free_start = xe->xe_name_offset;
@@ -4751,195 +5336,6 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
4751} 5336}
4752 5337
4753/* 5338/*
4754 * Handle the normal xattr set, including replace, delete and new.
4755 *
4756 * Note: "local" indicates the real data's locality. So we can't
4757 * just its bucket locality by its length.
4758 */
4759static void ocfs2_xattr_set_entry_normal(struct inode *inode,
4760 struct ocfs2_xattr_info *xi,
4761 struct ocfs2_xattr_search *xs,
4762 u32 name_hash,
4763 int local)
4764{
4765 struct ocfs2_xattr_entry *last, *xe;
4766 int name_len = strlen(xi->name);
4767 struct ocfs2_xattr_header *xh = xs->header;
4768 u16 count = le16_to_cpu(xh->xh_count), start;
4769 size_t blocksize = inode->i_sb->s_blocksize;
4770 char *val;
4771 size_t offs, size, new_size;
4772
4773 last = &xh->xh_entries[count];
4774 if (!xs->not_found) {
4775 xe = xs->here;
4776 offs = le16_to_cpu(xe->xe_name_offset);
4777 if (ocfs2_xattr_is_local(xe))
4778 size = OCFS2_XATTR_SIZE(name_len) +
4779 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4780 else
4781 size = OCFS2_XATTR_SIZE(name_len) +
4782 OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
4783
4784 /*
4785 * If the new value will be stored outside, xi->value has been
4786 * initalized as an empty ocfs2_xattr_value_root, and the same
4787 * goes with xi->value_len, so we can set new_size safely here.
4788 * See ocfs2_xattr_set_in_bucket.
4789 */
4790 new_size = OCFS2_XATTR_SIZE(name_len) +
4791 OCFS2_XATTR_SIZE(xi->value_len);
4792
4793 le16_add_cpu(&xh->xh_name_value_len, -size);
4794 if (xi->value) {
4795 if (new_size > size)
4796 goto set_new_name_value;
4797
4798 /* Now replace the old value with new one. */
4799 if (local)
4800 xe->xe_value_size = cpu_to_le64(xi->value_len);
4801 else
4802 xe->xe_value_size = 0;
4803
4804 val = ocfs2_xattr_bucket_get_val(inode,
4805 xs->bucket, offs);
4806 memset(val + OCFS2_XATTR_SIZE(name_len), 0,
4807 size - OCFS2_XATTR_SIZE(name_len));
4808 if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
4809 memcpy(val + OCFS2_XATTR_SIZE(name_len),
4810 xi->value, xi->value_len);
4811
4812 le16_add_cpu(&xh->xh_name_value_len, new_size);
4813 ocfs2_xattr_set_local(xe, local);
4814 return;
4815 } else {
4816 /*
4817 * Remove the old entry if there is more than one.
4818 * We don't remove the last entry so that we can
4819 * use it to indicate the hash value of the empty
4820 * bucket.
4821 */
4822 last -= 1;
4823 le16_add_cpu(&xh->xh_count, -1);
4824 if (xh->xh_count) {
4825 memmove(xe, xe + 1,
4826 (void *)last - (void *)xe);
4827 memset(last, 0,
4828 sizeof(struct ocfs2_xattr_entry));
4829 } else
4830 xh->xh_free_start =
4831 cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
4832
4833 return;
4834 }
4835 } else {
4836 /* find a new entry for insert. */
4837 int low = 0, high = count - 1, tmp;
4838 struct ocfs2_xattr_entry *tmp_xe;
4839
4840 while (low <= high && count) {
4841 tmp = (low + high) / 2;
4842 tmp_xe = &xh->xh_entries[tmp];
4843
4844 if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
4845 low = tmp + 1;
4846 else if (name_hash <
4847 le32_to_cpu(tmp_xe->xe_name_hash))
4848 high = tmp - 1;
4849 else {
4850 low = tmp;
4851 break;
4852 }
4853 }
4854
4855 xe = &xh->xh_entries[low];
4856 if (low != count)
4857 memmove(xe + 1, xe, (void *)last - (void *)xe);
4858
4859 le16_add_cpu(&xh->xh_count, 1);
4860 memset(xe, 0, sizeof(struct ocfs2_xattr_entry));
4861 xe->xe_name_hash = cpu_to_le32(name_hash);
4862 xe->xe_name_len = name_len;
4863 ocfs2_xattr_set_type(xe, xi->name_index);
4864 }
4865
4866set_new_name_value:
4867 /* Insert the new name+value. */
4868 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len);
4869
4870 /*
4871 * We must make sure that the name/value pair
4872 * exists in the same block.
4873 */
4874 offs = le16_to_cpu(xh->xh_free_start);
4875 start = offs - size;
4876
4877 if (start >> inode->i_sb->s_blocksize_bits !=
4878 (offs - 1) >> inode->i_sb->s_blocksize_bits) {
4879 offs = offs - offs % blocksize;
4880 xh->xh_free_start = cpu_to_le16(offs);
4881 }
4882
4883 val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size);
4884 xe->xe_name_offset = cpu_to_le16(offs - size);
4885
4886 memset(val, 0, size);
4887 memcpy(val, xi->name, name_len);
4888 memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len);
4889
4890 xe->xe_value_size = cpu_to_le64(xi->value_len);
4891 ocfs2_xattr_set_local(xe, local);
4892 xs->here = xe;
4893 le16_add_cpu(&xh->xh_free_start, -size);
4894 le16_add_cpu(&xh->xh_name_value_len, size);
4895
4896 return;
4897}
4898
4899/*
4900 * Set the xattr entry in the specified bucket.
4901 * The bucket is indicated by xs->bucket and it should have the enough
4902 * space for the xattr insertion.
4903 */
4904static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
4905 handle_t *handle,
4906 struct ocfs2_xattr_info *xi,
4907 struct ocfs2_xattr_search *xs,
4908 u32 name_hash,
4909 int local)
4910{
4911 int ret;
4912 u64 blkno;
4913
4914 mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
4915 (unsigned long)xi->value_len, xi->name_index,
4916 (unsigned long long)bucket_blkno(xs->bucket));
4917
4918 if (!xs->bucket->bu_bhs[1]) {
4919 blkno = bucket_blkno(xs->bucket);
4920 ocfs2_xattr_bucket_relse(xs->bucket);
4921 ret = ocfs2_read_xattr_bucket(xs->bucket, blkno);
4922 if (ret) {
4923 mlog_errno(ret);
4924 goto out;
4925 }
4926 }
4927
4928 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
4929 OCFS2_JOURNAL_ACCESS_WRITE);
4930 if (ret < 0) {
4931 mlog_errno(ret);
4932 goto out;
4933 }
4934
4935 ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
4936 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
4937
4938out:
4939 return ret;
4940}
4941
4942/*
4943 * Truncate the specified xe_off entry in xattr bucket. 5339 * Truncate the specified xe_off entry in xattr bucket.
4944 * bucket is indicated by header_bh and len is the new length. 5340 * bucket is indicated by header_bh and len is the new length.
4945 * Both the ocfs2_xattr_value_root and the entry will be updated here. 5341 * Both the ocfs2_xattr_value_root and the entry will be updated here.
@@ -5009,66 +5405,6 @@ out:
5009 return ret; 5405 return ret;
5010} 5406}
5011 5407
5012static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
5013 struct ocfs2_xattr_search *xs,
5014 int len,
5015 struct ocfs2_xattr_set_ctxt *ctxt)
5016{
5017 int ret, offset;
5018 struct ocfs2_xattr_entry *xe = xs->here;
5019 struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
5020
5021 BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe));
5022
5023 offset = xe - xh->xh_entries;
5024 ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket,
5025 offset, len, ctxt);
5026 if (ret)
5027 mlog_errno(ret);
5028
5029 return ret;
5030}
5031
5032static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
5033 handle_t *handle,
5034 struct ocfs2_xattr_search *xs,
5035 char *val,
5036 int value_len)
5037{
5038 int ret, offset, block_off;
5039 struct ocfs2_xattr_value_root *xv;
5040 struct ocfs2_xattr_entry *xe = xs->here;
5041 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
5042 void *base;
5043 struct ocfs2_xattr_value_buf vb = {
5044 .vb_access = ocfs2_journal_access,
5045 };
5046
5047 BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
5048
5049 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, xh,
5050 xe - xh->xh_entries,
5051 &block_off,
5052 &offset);
5053 if (ret) {
5054 mlog_errno(ret);
5055 goto out;
5056 }
5057
5058 base = bucket_block(xs->bucket, block_off);
5059 xv = (struct ocfs2_xattr_value_root *)(base + offset +
5060 OCFS2_XATTR_SIZE(xe->xe_name_len));
5061
5062 vb.vb_xv = xv;
5063 vb.vb_bh = xs->bucket->bu_bhs[block_off];
5064 ret = __ocfs2_xattr_set_value_outside(inode, handle,
5065 &vb, val, value_len);
5066 if (ret)
5067 mlog_errno(ret);
5068out:
5069 return ret;
5070}
5071
5072static int ocfs2_rm_xattr_cluster(struct inode *inode, 5408static int ocfs2_rm_xattr_cluster(struct inode *inode,
5073 struct buffer_head *root_bh, 5409 struct buffer_head *root_bh,
5074 u64 blkno, 5410 u64 blkno,
@@ -5167,128 +5503,6 @@ out:
5167 return ret; 5503 return ret;
5168} 5504}
5169 5505
5170static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
5171 handle_t *handle,
5172 struct ocfs2_xattr_search *xs)
5173{
5174 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
5175 struct ocfs2_xattr_entry *last = &xh->xh_entries[
5176 le16_to_cpu(xh->xh_count) - 1];
5177 int ret = 0;
5178
5179 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
5180 OCFS2_JOURNAL_ACCESS_WRITE);
5181 if (ret) {
5182 mlog_errno(ret);
5183 return;
5184 }
5185
5186 /* Remove the old entry. */
5187 memmove(xs->here, xs->here + 1,
5188 (void *)last - (void *)xs->here);
5189 memset(last, 0, sizeof(struct ocfs2_xattr_entry));
5190 le16_add_cpu(&xh->xh_count, -1);
5191
5192 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
5193}
5194
5195/*
5196 * Set the xattr name/value in the bucket specified in xs.
5197 *
5198 * As the new value in xi may be stored in the bucket or in an outside cluster,
5199 * we divide the whole process into 3 steps:
5200 * 1. insert name/value in the bucket(ocfs2_xattr_set_entry_in_bucket)
5201 * 2. truncate of the outside cluster(ocfs2_xattr_bucket_value_truncate_xs)
5202 * 3. Set the value to the outside cluster(ocfs2_xattr_bucket_set_value_outside)
5203 * 4. If the clusters for the new outside value can't be allocated, we need
5204 * to free the xattr we allocated in set.
5205 */
5206static int ocfs2_xattr_set_in_bucket(struct inode *inode,
5207 struct ocfs2_xattr_info *xi,
5208 struct ocfs2_xattr_search *xs,
5209 struct ocfs2_xattr_set_ctxt *ctxt)
5210{
5211 int ret, local = 1;
5212 size_t value_len;
5213 char *val = (char *)xi->value;
5214 struct ocfs2_xattr_entry *xe = xs->here;
5215 u32 name_hash = ocfs2_xattr_name_hash(inode, xi->name,
5216 strlen(xi->name));
5217
5218 if (!xs->not_found && !ocfs2_xattr_is_local(xe)) {
5219 /*
5220 * We need to truncate the xattr storage first.
5221 *
5222 * If both the old and new value are stored to
5223 * outside block, we only need to truncate
5224 * the storage and then set the value outside.
5225 *
5226 * If the new value should be stored within block,
5227 * we should free all the outside block first and
5228 * the modification to the xattr block will be done
5229 * by following steps.
5230 */
5231 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
5232 value_len = xi->value_len;
5233 else
5234 value_len = 0;
5235
5236 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
5237 value_len,
5238 ctxt);
5239 if (ret)
5240 goto out;
5241
5242 if (value_len)
5243 goto set_value_outside;
5244 }
5245
5246 value_len = xi->value_len;
5247 /* So we have to handle the inside block change now. */
5248 if (value_len > OCFS2_XATTR_INLINE_SIZE) {
5249 /*
5250 * If the new value will be stored outside of block,
5251 * initalize a new empty value root and insert it first.
5252 */
5253 local = 0;
5254 xi->value = &def_xv;
5255 xi->value_len = OCFS2_XATTR_ROOT_SIZE;
5256 }
5257
5258 ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs,
5259 name_hash, local);
5260 if (ret) {
5261 mlog_errno(ret);
5262 goto out;
5263 }
5264
5265 if (value_len <= OCFS2_XATTR_INLINE_SIZE)
5266 goto out;
5267
5268 /* allocate the space now for the outside block storage. */
5269 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
5270 value_len, ctxt);
5271 if (ret) {
5272 mlog_errno(ret);
5273
5274 if (xs->not_found) {
5275 /*
5276 * We can't allocate enough clusters for outside
5277 * storage and we have allocated xattr already,
5278 * so need to remove it.
5279 */
5280 ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs);
5281 }
5282 goto out;
5283 }
5284
5285set_value_outside:
5286 ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle,
5287 xs, val, value_len);
5288out:
5289 return ret;
5290}
5291
5292/* 5506/*
5293 * check whether the xattr bucket is filled up with the same hash value. 5507 * check whether the xattr bucket is filled up with the same hash value.
5294 * If we want to insert the xattr with the same hash, return -ENOSPC. 5508 * If we want to insert the xattr with the same hash, return -ENOSPC.
@@ -5317,156 +5531,116 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
5317 return 0; 5531 return 0;
5318} 5532}
5319 5533
5320static int ocfs2_xattr_set_entry_index_block(struct inode *inode, 5534/*
5321 struct ocfs2_xattr_info *xi, 5535 * Try to set the entry in the current bucket. If we fail, the caller
5322 struct ocfs2_xattr_search *xs, 5536 * will handle getting us another bucket.
5323 struct ocfs2_xattr_set_ctxt *ctxt) 5537 */
5538static int ocfs2_xattr_set_entry_bucket(struct inode *inode,
5539 struct ocfs2_xattr_info *xi,
5540 struct ocfs2_xattr_search *xs,
5541 struct ocfs2_xattr_set_ctxt *ctxt)
5324{ 5542{
5325 struct ocfs2_xattr_header *xh; 5543 int ret;
5326 struct ocfs2_xattr_entry *xe; 5544 struct ocfs2_xa_loc loc;
5327 u16 count, header_size, xh_free_start;
5328 int free, max_free, need, old;
5329 size_t value_size = 0, name_len = strlen(xi->name);
5330 size_t blocksize = inode->i_sb->s_blocksize;
5331 int ret, allocation = 0;
5332
5333 mlog_entry("Set xattr %s in xattr index block\n", xi->name);
5334
5335try_again:
5336 xh = xs->header;
5337 count = le16_to_cpu(xh->xh_count);
5338 xh_free_start = le16_to_cpu(xh->xh_free_start);
5339 header_size = sizeof(struct ocfs2_xattr_header) +
5340 count * sizeof(struct ocfs2_xattr_entry);
5341 max_free = OCFS2_XATTR_BUCKET_SIZE - header_size -
5342 le16_to_cpu(xh->xh_name_value_len) - OCFS2_XATTR_HEADER_GAP;
5343
5344 mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
5345 "of %u which exceed block size\n",
5346 (unsigned long long)bucket_blkno(xs->bucket),
5347 header_size);
5348 5545
5349 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) 5546 mlog_entry("Set xattr %s in xattr bucket\n", xi->xi_name);
5350 value_size = OCFS2_XATTR_ROOT_SIZE;
5351 else if (xi->value)
5352 value_size = OCFS2_XATTR_SIZE(xi->value_len);
5353 5547
5354 if (xs->not_found) 5548 ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
5355 need = sizeof(struct ocfs2_xattr_entry) + 5549 xs->not_found ? NULL : xs->here);
5356 OCFS2_XATTR_SIZE(name_len) + value_size; 5550 ret = ocfs2_xa_set(&loc, xi, ctxt);
5357 else { 5551 if (!ret) {
5358 need = value_size + OCFS2_XATTR_SIZE(name_len); 5552 xs->here = loc.xl_entry;
5553 goto out;
5554 }
5555 if (ret != -ENOSPC) {
5556 mlog_errno(ret);
5557 goto out;
5558 }
5359 5559
5360 /* 5560 /* Ok, we need space. Let's try defragmenting the bucket. */
5361 * We only replace the old value if the new length is smaller 5561 ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
5362 * than the old one. Otherwise we will allocate new space in the 5562 xs->bucket);
5363 * bucket to store it. 5563 if (ret) {
5364 */ 5564 mlog_errno(ret);
5365 xe = xs->here; 5565 goto out;
5366 if (ocfs2_xattr_is_local(xe)) 5566 }
5367 old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
5368 else
5369 old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
5370 5567
5371 if (old >= value_size) 5568 ret = ocfs2_xa_set(&loc, xi, ctxt);
5372 need = 0; 5569 if (!ret) {
5570 xs->here = loc.xl_entry;
5571 goto out;
5373 } 5572 }
5573 if (ret != -ENOSPC)
5574 mlog_errno(ret);
5374 5575
5375 free = xh_free_start - header_size - OCFS2_XATTR_HEADER_GAP;
5376 /*
5377 * We need to make sure the new name/value pair
5378 * can exist in the same block.
5379 */
5380 if (xh_free_start % blocksize < need)
5381 free -= xh_free_start % blocksize;
5382
5383 mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
5384 "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
5385 " %u\n", xs->not_found,
5386 (unsigned long long)bucket_blkno(xs->bucket),
5387 free, need, max_free, le16_to_cpu(xh->xh_free_start),
5388 le16_to_cpu(xh->xh_name_value_len));
5389
5390 if (free < need ||
5391 (xs->not_found &&
5392 count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) {
5393 if (need <= max_free &&
5394 count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
5395 /*
5396 * We can create the space by defragment. Since only the
5397 * name/value will be moved, the xe shouldn't be changed
5398 * in xs.
5399 */
5400 ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
5401 xs->bucket);
5402 if (ret) {
5403 mlog_errno(ret);
5404 goto out;
5405 }
5406 5576
5407 xh_free_start = le16_to_cpu(xh->xh_free_start); 5577out:
5408 free = xh_free_start - header_size 5578 mlog_exit(ret);
5409 - OCFS2_XATTR_HEADER_GAP; 5579 return ret;
5410 if (xh_free_start % blocksize < need) 5580}
5411 free -= xh_free_start % blocksize;
5412 5581
5413 if (free >= need) 5582static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
5414 goto xattr_set; 5583 struct ocfs2_xattr_info *xi,
5584 struct ocfs2_xattr_search *xs,
5585 struct ocfs2_xattr_set_ctxt *ctxt)
5586{
5587 int ret;
5415 5588
5416 mlog(0, "Can't get enough space for xattr insert by " 5589 mlog_entry("Set xattr %s in xattr index block\n", xi->xi_name);
5417 "defragment. Need %u bytes, but we have %d, so "
5418 "allocate new bucket for it.\n", need, free);
5419 }
5420 5590
5421 /* 5591 ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
5422 * We have to add new buckets or clusters and one 5592 if (!ret)
5423 * allocation should leave us enough space for insert. 5593 goto out;
5424 */ 5594 if (ret != -ENOSPC) {
5425 BUG_ON(allocation); 5595 mlog_errno(ret);
5596 goto out;
5597 }
5426 5598
5427 /* 5599 /* Ack, need more space. Let's try to get another bucket! */
5428 * We do not allow for overlapping ranges between buckets. And
5429 * the maximum number of collisions we will allow for then is
5430 * one bucket's worth, so check it here whether we need to
5431 * add a new bucket for the insert.
5432 */
5433 ret = ocfs2_check_xattr_bucket_collision(inode,
5434 xs->bucket,
5435 xi->name);
5436 if (ret) {
5437 mlog_errno(ret);
5438 goto out;
5439 }
5440 5600
5441 ret = ocfs2_add_new_xattr_bucket(inode, 5601 /*
5442 xs->xattr_bh, 5602 * We do not allow for overlapping ranges between buckets. And
5603 * the maximum number of collisions we will allow for then is
5604 * one bucket's worth, so check it here whether we need to
5605 * add a new bucket for the insert.
5606 */
5607 ret = ocfs2_check_xattr_bucket_collision(inode,
5443 xs->bucket, 5608 xs->bucket,
5444 ctxt); 5609 xi->xi_name);
5445 if (ret) { 5610 if (ret) {
5446 mlog_errno(ret); 5611 mlog_errno(ret);
5447 goto out; 5612 goto out;
5448 } 5613 }
5449 5614
5450 /* 5615 ret = ocfs2_add_new_xattr_bucket(inode,
5451 * ocfs2_add_new_xattr_bucket() will have updated 5616 xs->xattr_bh,
5452 * xs->bucket if it moved, but it will not have updated 5617 xs->bucket,
5453 * any of the other search fields. Thus, we drop it and 5618 ctxt);
5454 * re-search. Everything should be cached, so it'll be 5619 if (ret) {
5455 * quick. 5620 mlog_errno(ret);
5456 */ 5621 goto out;
5457 ocfs2_xattr_bucket_relse(xs->bucket);
5458 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
5459 xi->name_index,
5460 xi->name, xs);
5461 if (ret && ret != -ENODATA)
5462 goto out;
5463 xs->not_found = ret;
5464 allocation = 1;
5465 goto try_again;
5466 } 5622 }
5467 5623
5468xattr_set: 5624 /*
5469 ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt); 5625 * ocfs2_add_new_xattr_bucket() will have updated
5626 * xs->bucket if it moved, but it will not have updated
5627 * any of the other search fields. Thus, we drop it and
5628 * re-search. Everything should be cached, so it'll be
5629 * quick.
5630 */
5631 ocfs2_xattr_bucket_relse(xs->bucket);
5632 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
5633 xi->xi_name_index,
5634 xi->xi_name, xs);
5635 if (ret && ret != -ENODATA)
5636 goto out;
5637 xs->not_found = ret;
5638
5639 /* Ok, we have a new bucket, let's try again */
5640 ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
5641 if (ret && (ret != -ENOSPC))
5642 mlog_errno(ret);
5643
5470out: 5644out:
5471 mlog_exit(ret); 5645 mlog_exit(ret);
5472 return ret; 5646 return ret;
@@ -5678,7 +5852,7 @@ static int ocfs2_prepare_refcount_xattr(struct inode *inode,
5678 * refcount tree, and make the original extent become 3. So we will need 5852 * refcount tree, and make the original extent become 3. So we will need
5679 * 2 * cluster more extent recs at most. 5853 * 2 * cluster more extent recs at most.
5680 */ 5854 */
5681 if (!xi->value || xi->value_len <= OCFS2_XATTR_INLINE_SIZE) { 5855 if (!xi->xi_value || xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE) {
5682 5856
5683 ret = ocfs2_refcounted_xattr_delete_need(inode, 5857 ret = ocfs2_refcounted_xattr_delete_need(inode,
5684 &(*ref_tree)->rf_ci, 5858 &(*ref_tree)->rf_ci,
@@ -6354,33 +6528,33 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
6354 int indexed) 6528 int indexed)
6355{ 6529{
6356 int ret; 6530 int ret;
6357 handle_t *handle;
6358 struct ocfs2_alloc_context *meta_ac;
6359 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 6531 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6532 struct ocfs2_xattr_set_ctxt ctxt;
6360 6533
6361 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); 6534 memset(&ctxt, 0, sizeof(ctxt));
6535 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &ctxt.meta_ac);
6362 if (ret < 0) { 6536 if (ret < 0) {
6363 mlog_errno(ret); 6537 mlog_errno(ret);
6364 return ret; 6538 return ret;
6365 } 6539 }
6366 6540
6367 handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS); 6541 ctxt.handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS);
6368 if (IS_ERR(handle)) { 6542 if (IS_ERR(ctxt.handle)) {
6369 ret = PTR_ERR(handle); 6543 ret = PTR_ERR(ctxt.handle);
6370 mlog_errno(ret); 6544 mlog_errno(ret);
6371 goto out; 6545 goto out;
6372 } 6546 }
6373 6547
6374 mlog(0, "create new xattr block for inode %llu, index = %d\n", 6548 mlog(0, "create new xattr block for inode %llu, index = %d\n",
6375 (unsigned long long)fe_bh->b_blocknr, indexed); 6549 (unsigned long long)fe_bh->b_blocknr, indexed);
6376 ret = ocfs2_create_xattr_block(handle, inode, fe_bh, 6550 ret = ocfs2_create_xattr_block(inode, fe_bh, &ctxt, indexed,
6377 meta_ac, ret_bh, indexed); 6551 ret_bh);
6378 if (ret) 6552 if (ret)
6379 mlog_errno(ret); 6553 mlog_errno(ret);
6380 6554
6381 ocfs2_commit_trans(osb, handle); 6555 ocfs2_commit_trans(osb, ctxt.handle);
6382out: 6556out:
6383 ocfs2_free_alloc_context(meta_ac); 6557 ocfs2_free_alloc_context(ctxt.meta_ac);
6384 return ret; 6558 return ret;
6385} 6559}
6386 6560
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index f3b7c1541f3a..c82af6acc2e7 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -6,11 +6,13 @@
6#include <linux/version.h> 6#include <linux/version.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/slab.h>
9#include <linux/fs.h> 10#include <linux/fs.h>
10#include <linux/vfs.h> 11#include <linux/vfs.h>
11#include <linux/parser.h> 12#include <linux/parser.h>
12#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
13#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15#include <linux/writeback.h>
14#include <linux/crc-itu-t.h> 16#include <linux/crc-itu-t.h>
15#include "omfs.h" 17#include "omfs.h"
16 18
@@ -89,7 +91,7 @@ static void omfs_update_checksums(struct omfs_inode *oi)
89 oi->i_head.h_check_xor = xor; 91 oi->i_head.h_check_xor = xor;
90} 92}
91 93
92static int omfs_write_inode(struct inode *inode, int wait) 94static int __omfs_write_inode(struct inode *inode, int wait)
93{ 95{
94 struct omfs_inode *oi; 96 struct omfs_inode *oi;
95 struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); 97 struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
@@ -162,9 +164,14 @@ out:
162 return ret; 164 return ret;
163} 165}
164 166
167static int omfs_write_inode(struct inode *inode, struct writeback_control *wbc)
168{
169 return __omfs_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
170}
171
165int omfs_sync_inode(struct inode *inode) 172int omfs_sync_inode(struct inode *inode)
166{ 173{
167 return omfs_write_inode(inode, 1); 174 return __omfs_write_inode(inode, 1);
168} 175}
169 176
170/* 177/*
diff --git a/fs/open.c b/fs/open.c
index 040cef72bc00..74e5cd9f718e 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -8,10 +8,8 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/file.h> 9#include <linux/file.h>
10#include <linux/fdtable.h> 10#include <linux/fdtable.h>
11#include <linux/quotaops.h>
12#include <linux/fsnotify.h> 11#include <linux/fsnotify.h>
13#include <linux/module.h> 12#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/tty.h> 13#include <linux/tty.h>
16#include <linux/namei.h> 14#include <linux/namei.h>
17#include <linux/backing-dev.h> 15#include <linux/backing-dev.h>
@@ -21,6 +19,7 @@
21#include <linux/mount.h> 19#include <linux/mount.h>
22#include <linux/vfs.h> 20#include <linux/vfs.h>
23#include <linux/fcntl.h> 21#include <linux/fcntl.h>
22#include <linux/slab.h>
24#include <asm/uaccess.h> 23#include <asm/uaccess.h>
25#include <linux/fs.h> 24#include <linux/fs.h>
26#include <linux/personality.h> 25#include <linux/personality.h>
@@ -271,17 +270,15 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
271 * Make sure that there are no leases. get_write_access() protects 270 * Make sure that there are no leases. get_write_access() protects
272 * against the truncate racing with a lease-granting setlease(). 271 * against the truncate racing with a lease-granting setlease().
273 */ 272 */
274 error = break_lease(inode, FMODE_WRITE); 273 error = break_lease(inode, O_WRONLY);
275 if (error) 274 if (error)
276 goto put_write_and_out; 275 goto put_write_and_out;
277 276
278 error = locks_verify_truncate(inode, NULL, length); 277 error = locks_verify_truncate(inode, NULL, length);
279 if (!error) 278 if (!error)
280 error = security_path_truncate(&path, length, 0); 279 error = security_path_truncate(&path, length, 0);
281 if (!error) { 280 if (!error)
282 vfs_dq_init(inode);
283 error = do_truncate(path.dentry, length, 0, NULL); 281 error = do_truncate(path.dentry, length, 0, NULL);
284 }
285 282
286put_write_and_out: 283put_write_and_out:
287 put_write_access(inode); 284 put_write_access(inode);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 64bc8998ac9a..e238ab23a9e7 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -16,6 +16,7 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/slab.h>
19#include <linux/kmod.h> 20#include <linux/kmod.h>
20#include <linux/ctype.h> 21#include <linux/ctype.h>
21#include <linux/genhd.h> 22#include <linux/genhd.h>
@@ -412,9 +413,10 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
412 pdev = part_to_dev(p); 413 pdev = part_to_dev(p);
413 414
414 p->start_sect = start; 415 p->start_sect = start;
415 p->alignment_offset = queue_sector_alignment_offset(disk->queue, start); 416 p->alignment_offset =
416 p->discard_alignment = queue_sector_discard_alignment(disk->queue, 417 queue_limit_alignment_offset(&disk->queue->limits, start);
417 start); 418 p->discard_alignment =
419 queue_limit_discard_alignment(&disk->queue->limits, start);
418 p->nr_sects = len; 420 p->nr_sects = len;
419 p->partno = partno; 421 p->partno = partno;
420 p->policy = get_disk_ro(disk); 422 p->policy = get_disk_ro(disk);
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 49cfd5f54238..91babdae7587 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -95,6 +95,7 @@
95 ************************************************************/ 95 ************************************************************/
96#include <linux/crc32.h> 96#include <linux/crc32.h>
97#include <linux/math64.h> 97#include <linux/math64.h>
98#include <linux/slab.h>
98#include "check.h" 99#include "check.h"
99#include "efi.h" 100#include "efi.h"
100 101
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 0028d2ef0662..90be97f1f5a8 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -31,14 +31,17 @@
31 */ 31 */
32#include <asm/unaligned.h> 32#include <asm/unaligned.h>
33 33
34#define SYS_IND(p) (get_unaligned(&p->sys_ind)) 34#define SYS_IND(p) get_unaligned(&p->sys_ind)
35#define NR_SECTS(p) ({ __le32 __a = get_unaligned(&p->nr_sects); \
36 le32_to_cpu(__a); \
37 })
38 35
39#define START_SECT(p) ({ __le32 __a = get_unaligned(&p->start_sect); \ 36static inline sector_t nr_sects(struct partition *p)
40 le32_to_cpu(__a); \ 37{
41 }) 38 return (sector_t)get_unaligned_le32(&p->nr_sects);
39}
40
41static inline sector_t start_sect(struct partition *p)
42{
43 return (sector_t)get_unaligned_le32(&p->start_sect);
44}
42 45
43static inline int is_extended_partition(struct partition *p) 46static inline int is_extended_partition(struct partition *p)
44{ 47{
@@ -104,13 +107,13 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
104 107
105static void 108static void
106parse_extended(struct parsed_partitions *state, struct block_device *bdev, 109parse_extended(struct parsed_partitions *state, struct block_device *bdev,
107 u32 first_sector, u32 first_size) 110 sector_t first_sector, sector_t first_size)
108{ 111{
109 struct partition *p; 112 struct partition *p;
110 Sector sect; 113 Sector sect;
111 unsigned char *data; 114 unsigned char *data;
112 u32 this_sector, this_size; 115 sector_t this_sector, this_size;
113 int sector_size = bdev_logical_block_size(bdev) / 512; 116 sector_t sector_size = bdev_logical_block_size(bdev) / 512;
114 int loopct = 0; /* number of links followed 117 int loopct = 0; /* number of links followed
115 without finding a data partition */ 118 without finding a data partition */
116 int i; 119 int i;
@@ -145,14 +148,14 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
145 * First process the data partition(s) 148 * First process the data partition(s)
146 */ 149 */
147 for (i=0; i<4; i++, p++) { 150 for (i=0; i<4; i++, p++) {
148 u32 offs, size, next; 151 sector_t offs, size, next;
149 if (!NR_SECTS(p) || is_extended_partition(p)) 152 if (!nr_sects(p) || is_extended_partition(p))
150 continue; 153 continue;
151 154
152 /* Check the 3rd and 4th entries - 155 /* Check the 3rd and 4th entries -
153 these sometimes contain random garbage */ 156 these sometimes contain random garbage */
154 offs = START_SECT(p)*sector_size; 157 offs = start_sect(p)*sector_size;
155 size = NR_SECTS(p)*sector_size; 158 size = nr_sects(p)*sector_size;
156 next = this_sector + offs; 159 next = this_sector + offs;
157 if (i >= 2) { 160 if (i >= 2) {
158 if (offs + size > this_size) 161 if (offs + size > this_size)
@@ -179,13 +182,13 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
179 */ 182 */
180 p -= 4; 183 p -= 4;
181 for (i=0; i<4; i++, p++) 184 for (i=0; i<4; i++, p++)
182 if (NR_SECTS(p) && is_extended_partition(p)) 185 if (nr_sects(p) && is_extended_partition(p))
183 break; 186 break;
184 if (i == 4) 187 if (i == 4)
185 goto done; /* nothing left to do */ 188 goto done; /* nothing left to do */
186 189
187 this_sector = first_sector + START_SECT(p) * sector_size; 190 this_sector = first_sector + start_sect(p) * sector_size;
188 this_size = NR_SECTS(p) * sector_size; 191 this_size = nr_sects(p) * sector_size;
189 put_dev_sector(sect); 192 put_dev_sector(sect);
190 } 193 }
191done: 194done:
@@ -197,7 +200,7 @@ done:
197 200
198static void 201static void
199parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, 202parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
200 u32 offset, u32 size, int origin) 203 sector_t offset, sector_t size, int origin)
201{ 204{
202#ifdef CONFIG_SOLARIS_X86_PARTITION 205#ifdef CONFIG_SOLARIS_X86_PARTITION
203 Sector sect; 206 Sector sect;
@@ -244,7 +247,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
244 */ 247 */
245static void 248static void
246parse_bsd(struct parsed_partitions *state, struct block_device *bdev, 249parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
247 u32 offset, u32 size, int origin, char *flavour, 250 sector_t offset, sector_t size, int origin, char *flavour,
248 int max_partitions) 251 int max_partitions)
249{ 252{
250 Sector sect; 253 Sector sect;
@@ -263,7 +266,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
263 if (le16_to_cpu(l->d_npartitions) < max_partitions) 266 if (le16_to_cpu(l->d_npartitions) < max_partitions)
264 max_partitions = le16_to_cpu(l->d_npartitions); 267 max_partitions = le16_to_cpu(l->d_npartitions);
265 for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) { 268 for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) {
266 u32 bsd_start, bsd_size; 269 sector_t bsd_start, bsd_size;
267 270
268 if (state->next == state->limit) 271 if (state->next == state->limit)
269 break; 272 break;
@@ -290,7 +293,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
290 293
291static void 294static void
292parse_freebsd(struct parsed_partitions *state, struct block_device *bdev, 295parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
293 u32 offset, u32 size, int origin) 296 sector_t offset, sector_t size, int origin)
294{ 297{
295#ifdef CONFIG_BSD_DISKLABEL 298#ifdef CONFIG_BSD_DISKLABEL
296 parse_bsd(state, bdev, offset, size, origin, 299 parse_bsd(state, bdev, offset, size, origin,
@@ -300,7 +303,7 @@ parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
300 303
301static void 304static void
302parse_netbsd(struct parsed_partitions *state, struct block_device *bdev, 305parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
303 u32 offset, u32 size, int origin) 306 sector_t offset, sector_t size, int origin)
304{ 307{
305#ifdef CONFIG_BSD_DISKLABEL 308#ifdef CONFIG_BSD_DISKLABEL
306 parse_bsd(state, bdev, offset, size, origin, 309 parse_bsd(state, bdev, offset, size, origin,
@@ -310,7 +313,7 @@ parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
310 313
311static void 314static void
312parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, 315parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
313 u32 offset, u32 size, int origin) 316 sector_t offset, sector_t size, int origin)
314{ 317{
315#ifdef CONFIG_BSD_DISKLABEL 318#ifdef CONFIG_BSD_DISKLABEL
316 parse_bsd(state, bdev, offset, size, origin, 319 parse_bsd(state, bdev, offset, size, origin,
@@ -324,7 +327,7 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
324 */ 327 */
325static void 328static void
326parse_unixware(struct parsed_partitions *state, struct block_device *bdev, 329parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
327 u32 offset, u32 size, int origin) 330 sector_t offset, sector_t size, int origin)
328{ 331{
329#ifdef CONFIG_UNIXWARE_DISKLABEL 332#ifdef CONFIG_UNIXWARE_DISKLABEL
330 Sector sect; 333 Sector sect;
@@ -348,7 +351,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
348 351
349 if (p->s_label != UNIXWARE_FS_UNUSED) 352 if (p->s_label != UNIXWARE_FS_UNUSED)
350 put_partition(state, state->next++, 353 put_partition(state, state->next++,
351 START_SECT(p), NR_SECTS(p)); 354 le32_to_cpu(p->start_sect),
355 le32_to_cpu(p->nr_sects));
352 p++; 356 p++;
353 } 357 }
354 put_dev_sector(sect); 358 put_dev_sector(sect);
@@ -363,7 +367,7 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
363 */ 367 */
364static void 368static void
365parse_minix(struct parsed_partitions *state, struct block_device *bdev, 369parse_minix(struct parsed_partitions *state, struct block_device *bdev,
366 u32 offset, u32 size, int origin) 370 sector_t offset, sector_t size, int origin)
367{ 371{
368#ifdef CONFIG_MINIX_SUBPARTITION 372#ifdef CONFIG_MINIX_SUBPARTITION
369 Sector sect; 373 Sector sect;
@@ -390,7 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
390 /* add each partition in use */ 394 /* add each partition in use */
391 if (SYS_IND(p) == MINIX_PARTITION) 395 if (SYS_IND(p) == MINIX_PARTITION)
392 put_partition(state, state->next++, 396 put_partition(state, state->next++,
393 START_SECT(p), NR_SECTS(p)); 397 start_sect(p), nr_sects(p));
394 } 398 }
395 printk(" >\n"); 399 printk(" >\n");
396 } 400 }
@@ -401,7 +405,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
401static struct { 405static struct {
402 unsigned char id; 406 unsigned char id;
403 void (*parse)(struct parsed_partitions *, struct block_device *, 407 void (*parse)(struct parsed_partitions *, struct block_device *,
404 u32, u32, int); 408 sector_t, sector_t, int);
405} subtypes[] = { 409} subtypes[] = {
406 {FREEBSD_PARTITION, parse_freebsd}, 410 {FREEBSD_PARTITION, parse_freebsd},
407 {NETBSD_PARTITION, parse_netbsd}, 411 {NETBSD_PARTITION, parse_netbsd},
@@ -415,7 +419,7 @@ static struct {
415 419
416int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) 420int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
417{ 421{
418 int sector_size = bdev_logical_block_size(bdev) / 512; 422 sector_t sector_size = bdev_logical_block_size(bdev) / 512;
419 Sector sect; 423 Sector sect;
420 unsigned char *data; 424 unsigned char *data;
421 struct partition *p; 425 struct partition *p;
@@ -483,14 +487,21 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
483 487
484 state->next = 5; 488 state->next = 5;
485 for (slot = 1 ; slot <= 4 ; slot++, p++) { 489 for (slot = 1 ; slot <= 4 ; slot++, p++) {
486 u32 start = START_SECT(p)*sector_size; 490 sector_t start = start_sect(p)*sector_size;
487 u32 size = NR_SECTS(p)*sector_size; 491 sector_t size = nr_sects(p)*sector_size;
488 if (!size) 492 if (!size)
489 continue; 493 continue;
490 if (is_extended_partition(p)) { 494 if (is_extended_partition(p)) {
491 /* prevent someone doing mkfs or mkswap on an 495 /*
492 extended partition, but leave room for LILO */ 496 * prevent someone doing mkfs or mkswap on an
493 put_partition(state, slot, start, size == 1 ? 1 : 2); 497 * extended partition, but leave room for LILO
498 * FIXME: this uses one logical sector for > 512b
499 * sector, although it may not be enough/proper.
500 */
501 sector_t n = 2;
502 n = min(size, max(sector_size, n));
503 put_partition(state, slot, start, n);
504
494 printk(" <"); 505 printk(" <");
495 parse_extended(state, bdev, start, size); 506 parse_extended(state, bdev, start, size);
496 printk(" >"); 507 printk(" >");
@@ -513,7 +524,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
513 unsigned char id = SYS_IND(p); 524 unsigned char id = SYS_IND(p);
514 int n; 525 int n;
515 526
516 if (!NR_SECTS(p)) 527 if (!nr_sects(p))
517 continue; 528 continue;
518 529
519 for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++) 530 for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++)
@@ -521,8 +532,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
521 532
522 if (!subtypes[n].parse) 533 if (!subtypes[n].parse)
523 continue; 534 continue;
524 subtypes[n].parse(state, bdev, START_SECT(p)*sector_size, 535 subtypes[n].parse(state, bdev, start_sect(p)*sector_size,
525 NR_SECTS(p)*sector_size, slot); 536 nr_sects(p)*sector_size, slot);
526 } 537 }
527 put_dev_sector(sect); 538 put_dev_sector(sect);
528 return 1; 539 return 1;
diff --git a/fs/pnode.c b/fs/pnode.c
index 8d5f392ec3d3..5cc564a83149 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -86,7 +86,7 @@ static int do_make_slave(struct vfsmount *mnt)
86 86
87 /* 87 /*
88 * slave 'mnt' to a peer mount that has the 88 * slave 'mnt' to a peer mount that has the
89 * same root dentry. If none is available than 89 * same root dentry. If none is available then
90 * slave it to anything that is available. 90 * slave it to anything that is available.
91 */ 91 */
92 while ((peer_mnt = next_peer(peer_mnt)) != mnt && 92 while ((peer_mnt = next_peer(peer_mnt)) != mnt &&
@@ -147,6 +147,11 @@ void change_mnt_propagation(struct vfsmount *mnt, int type)
147 * get the next mount in the propagation tree. 147 * get the next mount in the propagation tree.
148 * @m: the mount seen last 148 * @m: the mount seen last
149 * @origin: the original mount from where the tree walk initiated 149 * @origin: the original mount from where the tree walk initiated
150 *
151 * Note that peer groups form contiguous segments of slave lists.
152 * We rely on that in get_source() to be able to find out if
153 * vfsmount found while iterating with propagation_next() is
154 * a peer of one we'd found earlier.
150 */ 155 */
151static struct vfsmount *propagation_next(struct vfsmount *m, 156static struct vfsmount *propagation_next(struct vfsmount *m,
152 struct vfsmount *origin) 157 struct vfsmount *origin)
@@ -186,10 +191,6 @@ static struct vfsmount *get_source(struct vfsmount *dest,
186{ 191{
187 struct vfsmount *p_last_src = NULL; 192 struct vfsmount *p_last_src = NULL;
188 struct vfsmount *p_last_dest = NULL; 193 struct vfsmount *p_last_dest = NULL;
189 *type = CL_PROPAGATION;
190
191 if (IS_MNT_SHARED(dest))
192 *type |= CL_MAKE_SHARED;
193 194
194 while (last_dest != dest->mnt_master) { 195 while (last_dest != dest->mnt_master) {
195 p_last_dest = last_dest; 196 p_last_dest = last_dest;
@@ -202,13 +203,18 @@ static struct vfsmount *get_source(struct vfsmount *dest,
202 do { 203 do {
203 p_last_dest = next_peer(p_last_dest); 204 p_last_dest = next_peer(p_last_dest);
204 } while (IS_MNT_NEW(p_last_dest)); 205 } while (IS_MNT_NEW(p_last_dest));
206 /* is that a peer of the earlier? */
207 if (dest == p_last_dest) {
208 *type = CL_MAKE_SHARED;
209 return p_last_src;
210 }
205 } 211 }
206 212 /* slave of the earlier, then */
207 if (dest != p_last_dest) { 213 *type = CL_SLAVE;
208 *type |= CL_SLAVE; 214 /* beginning of peer group among the slaves? */
209 return last_src; 215 if (IS_MNT_SHARED(dest))
210 } else 216 *type |= CL_MAKE_SHARED;
211 return p_last_src; 217 return last_src;
212} 218}
213 219
214/* 220/*
diff --git a/fs/pnode.h b/fs/pnode.h
index 958665d662af..1ea4ae1efcd3 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -21,12 +21,11 @@
21#define CL_SLAVE 0x02 21#define CL_SLAVE 0x02
22#define CL_COPY_ALL 0x04 22#define CL_COPY_ALL 0x04
23#define CL_MAKE_SHARED 0x08 23#define CL_MAKE_SHARED 0x08
24#define CL_PROPAGATION 0x10 24#define CL_PRIVATE 0x10
25#define CL_PRIVATE 0x20
26 25
27static inline void set_mnt_shared(struct vfsmount *mnt) 26static inline void set_mnt_shared(struct vfsmount *mnt)
28{ 27{
29 mnt->mnt_flags &= ~MNT_PNODE_MASK; 28 mnt->mnt_flags &= ~MNT_SHARED_MASK;
30 mnt->mnt_flags |= MNT_SHARED; 29 mnt->mnt_flags |= MNT_SHARED;
31} 30}
32 31
diff --git a/fs/proc/array.c b/fs/proc/array.c
index f560325c444f..e51f2ec2c5e5 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -68,7 +68,6 @@
68#include <linux/hugetlb.h> 68#include <linux/hugetlb.h>
69#include <linux/pagemap.h> 69#include <linux/pagemap.h>
70#include <linux/swap.h> 70#include <linux/swap.h>
71#include <linux/slab.h>
72#include <linux/smp.h> 71#include <linux/smp.h>
73#include <linux/signal.h> 72#include <linux/signal.h>
74#include <linux/highmem.h> 73#include <linux/highmem.h>
@@ -270,8 +269,10 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
270 blocked = p->blocked; 269 blocked = p->blocked;
271 collect_sigign_sigcatch(p, &ignored, &caught); 270 collect_sigign_sigcatch(p, &ignored, &caught);
272 num_threads = atomic_read(&p->signal->count); 271 num_threads = atomic_read(&p->signal->count);
272 rcu_read_lock(); /* FIXME: is this correct? */
273 qsize = atomic_read(&__task_cred(p)->user->sigpending); 273 qsize = atomic_read(&__task_cred(p)->user->sigpending);
274 qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; 274 rcu_read_unlock();
275 qlim = task_rlimit(p, RLIMIT_SIGPENDING);
275 unlock_task_sighand(p, &flags); 276 unlock_task_sighand(p, &flags);
276 } 277 }
277 278
@@ -327,94 +328,6 @@ static inline void task_context_switch_counts(struct seq_file *m,
327 p->nivcsw); 328 p->nivcsw);
328} 329}
329 330
330#ifdef CONFIG_MMU
331
332struct stack_stats {
333 struct vm_area_struct *vma;
334 unsigned long startpage;
335 unsigned long usage;
336};
337
338static int stack_usage_pte_range(pmd_t *pmd, unsigned long addr,
339 unsigned long end, struct mm_walk *walk)
340{
341 struct stack_stats *ss = walk->private;
342 struct vm_area_struct *vma = ss->vma;
343 pte_t *pte, ptent;
344 spinlock_t *ptl;
345 int ret = 0;
346
347 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
348 for (; addr != end; pte++, addr += PAGE_SIZE) {
349 ptent = *pte;
350
351#ifdef CONFIG_STACK_GROWSUP
352 if (pte_present(ptent) || is_swap_pte(ptent))
353 ss->usage = addr - ss->startpage + PAGE_SIZE;
354#else
355 if (pte_present(ptent) || is_swap_pte(ptent)) {
356 ss->usage = ss->startpage - addr + PAGE_SIZE;
357 pte++;
358 ret = 1;
359 break;
360 }
361#endif
362 }
363 pte_unmap_unlock(pte - 1, ptl);
364 cond_resched();
365 return ret;
366}
367
368static inline unsigned long get_stack_usage_in_bytes(struct vm_area_struct *vma,
369 struct task_struct *task)
370{
371 struct stack_stats ss;
372 struct mm_walk stack_walk = {
373 .pmd_entry = stack_usage_pte_range,
374 .mm = vma->vm_mm,
375 .private = &ss,
376 };
377
378 if (!vma->vm_mm || is_vm_hugetlb_page(vma))
379 return 0;
380
381 ss.vma = vma;
382 ss.startpage = task->stack_start & PAGE_MASK;
383 ss.usage = 0;
384
385#ifdef CONFIG_STACK_GROWSUP
386 walk_page_range(KSTK_ESP(task) & PAGE_MASK, vma->vm_end,
387 &stack_walk);
388#else
389 walk_page_range(vma->vm_start, (KSTK_ESP(task) & PAGE_MASK) + PAGE_SIZE,
390 &stack_walk);
391#endif
392 return ss.usage;
393}
394
395static inline void task_show_stack_usage(struct seq_file *m,
396 struct task_struct *task)
397{
398 struct vm_area_struct *vma;
399 struct mm_struct *mm = get_task_mm(task);
400
401 if (mm) {
402 down_read(&mm->mmap_sem);
403 vma = find_vma(mm, task->stack_start);
404 if (vma)
405 seq_printf(m, "Stack usage:\t%lu kB\n",
406 get_stack_usage_in_bytes(vma, task) >> 10);
407
408 up_read(&mm->mmap_sem);
409 mmput(mm);
410 }
411}
412#else
413static void task_show_stack_usage(struct seq_file *m, struct task_struct *task)
414{
415}
416#endif /* CONFIG_MMU */
417
418static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) 331static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
419{ 332{
420 seq_printf(m, "Cpus_allowed:\t"); 333 seq_printf(m, "Cpus_allowed:\t");
@@ -445,7 +358,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
445 task_show_regs(m, task); 358 task_show_regs(m, task);
446#endif 359#endif
447 task_context_switch_counts(m, task); 360 task_context_switch_counts(m, task);
448 task_show_stack_usage(m, task);
449 return 0; 361 return 0;
450} 362}
451 363
@@ -507,7 +419,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
507 cutime = sig->cutime; 419 cutime = sig->cutime;
508 cstime = sig->cstime; 420 cstime = sig->cstime;
509 cgtime = sig->cgtime; 421 cgtime = sig->cgtime;
510 rsslim = sig->rlim[RLIMIT_RSS].rlim_cur; 422 rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
511 423
512 /* add up live thread stats at the group level */ 424 /* add up live thread stats at the group level */
513 if (whole) { 425 if (whole) {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 18d5cc62d8ed..8418fcc0a6ab 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -81,6 +81,7 @@
81#include <linux/elf.h> 81#include <linux/elf.h>
82#include <linux/pid_namespace.h> 82#include <linux/pid_namespace.h>
83#include <linux/fs_struct.h> 83#include <linux/fs_struct.h>
84#include <linux/slab.h>
84#include "internal.h" 85#include "internal.h"
85 86
86/* NOTE: 87/* NOTE:
@@ -442,12 +443,13 @@ static const struct file_operations proc_lstats_operations = {
442unsigned long badness(struct task_struct *p, unsigned long uptime); 443unsigned long badness(struct task_struct *p, unsigned long uptime);
443static int proc_oom_score(struct task_struct *task, char *buffer) 444static int proc_oom_score(struct task_struct *task, char *buffer)
444{ 445{
445 unsigned long points; 446 unsigned long points = 0;
446 struct timespec uptime; 447 struct timespec uptime;
447 448
448 do_posix_clock_monotonic_gettime(&uptime); 449 do_posix_clock_monotonic_gettime(&uptime);
449 read_lock(&tasklist_lock); 450 read_lock(&tasklist_lock);
450 points = badness(task->group_leader, uptime.tv_sec); 451 if (pid_alive(task))
452 points = badness(task, uptime.tv_sec);
451 read_unlock(&tasklist_lock); 453 read_unlock(&tasklist_lock);
452 return sprintf(buffer, "%lu\n", points); 454 return sprintf(buffer, "%lu\n", points);
453} 455}
@@ -647,17 +649,11 @@ static int mounts_release(struct inode *inode, struct file *file)
647static unsigned mounts_poll(struct file *file, poll_table *wait) 649static unsigned mounts_poll(struct file *file, poll_table *wait)
648{ 650{
649 struct proc_mounts *p = file->private_data; 651 struct proc_mounts *p = file->private_data;
650 struct mnt_namespace *ns = p->ns;
651 unsigned res = POLLIN | POLLRDNORM; 652 unsigned res = POLLIN | POLLRDNORM;
652 653
653 poll_wait(file, &ns->poll, wait); 654 poll_wait(file, &p->ns->poll, wait);
654 655 if (mnt_had_events(p))
655 spin_lock(&vfsmount_lock);
656 if (p->event != ns->event) {
657 p->event = ns->event;
658 res |= POLLERR | POLLPRI; 656 res |= POLLERR | POLLPRI;
659 }
660 spin_unlock(&vfsmount_lock);
661 657
662 return res; 658 return res;
663} 659}
@@ -1095,8 +1091,12 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1095 if (!capable(CAP_AUDIT_CONTROL)) 1091 if (!capable(CAP_AUDIT_CONTROL))
1096 return -EPERM; 1092 return -EPERM;
1097 1093
1098 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) 1094 rcu_read_lock();
1095 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
1096 rcu_read_unlock();
1099 return -EPERM; 1097 return -EPERM;
1098 }
1099 rcu_read_unlock();
1100 1100
1101 if (count >= PAGE_SIZE) 1101 if (count >= PAGE_SIZE)
1102 count = PAGE_SIZE - 1; 1102 count = PAGE_SIZE - 1;
@@ -1419,7 +1419,6 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
1419 goto out; 1419 goto out;
1420 1420
1421 error = PROC_I(inode)->op.proc_get_link(inode, &nd->path); 1421 error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
1422 nd->last_type = LAST_BIND;
1423out: 1422out:
1424 return ERR_PTR(error); 1423 return ERR_PTR(error);
1425} 1424}
@@ -2370,16 +2369,30 @@ static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
2370{ 2369{
2371 struct pid_namespace *ns = dentry->d_sb->s_fs_info; 2370 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
2372 pid_t tgid = task_tgid_nr_ns(current, ns); 2371 pid_t tgid = task_tgid_nr_ns(current, ns);
2373 char tmp[PROC_NUMBUF]; 2372 char *name = ERR_PTR(-ENOENT);
2374 if (!tgid) 2373 if (tgid) {
2375 return ERR_PTR(-ENOENT); 2374 name = __getname();
2376 sprintf(tmp, "%d", task_tgid_nr_ns(current, ns)); 2375 if (!name)
2377 return ERR_PTR(vfs_follow_link(nd,tmp)); 2376 name = ERR_PTR(-ENOMEM);
2377 else
2378 sprintf(name, "%d", tgid);
2379 }
2380 nd_set_link(nd, name);
2381 return NULL;
2382}
2383
2384static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
2385 void *cookie)
2386{
2387 char *s = nd_get_link(nd);
2388 if (!IS_ERR(s))
2389 __putname(s);
2378} 2390}
2379 2391
2380static const struct inode_operations proc_self_inode_operations = { 2392static const struct inode_operations proc_self_inode_operations = {
2381 .readlink = proc_self_readlink, 2393 .readlink = proc_self_readlink,
2382 .follow_link = proc_self_follow_link, 2394 .follow_link = proc_self_follow_link,
2395 .put_link = proc_self_put_link,
2383}; 2396};
2384 2397
2385/* 2398/*
@@ -2896,7 +2909,7 @@ out_no_task:
2896 */ 2909 */
2897static const struct pid_entry tid_base_stuff[] = { 2910static const struct pid_entry tid_base_stuff[] = {
2898 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), 2911 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
2899 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fd_operations), 2912 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
2900 REG("environ", S_IRUSR, proc_environ_operations), 2913 REG("environ", S_IRUSR, proc_environ_operations),
2901 INF("auxv", S_IRUSR, proc_pid_auxv), 2914 INF("auxv", S_IRUSR, proc_pid_auxv),
2902 ONE("status", S_IRUGO, proc_pid_status), 2915 ONE("status", S_IRUGO, proc_pid_status),
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 480cb1065eec..43c127490606 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -13,6 +13,7 @@
13#include <linux/proc_fs.h> 13#include <linux/proc_fs.h>
14#include <linux/stat.h> 14#include <linux/stat.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h>
16#include <linux/mount.h> 17#include <linux/mount.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/idr.h> 19#include <linux/idr.h>
@@ -291,19 +292,17 @@ static const struct inode_operations proc_file_inode_operations = {
291 * returns the struct proc_dir_entry for "/proc/tty/driver", and 292 * returns the struct proc_dir_entry for "/proc/tty/driver", and
292 * returns "serial" in residual. 293 * returns "serial" in residual.
293 */ 294 */
294static int xlate_proc_name(const char *name, 295static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
295 struct proc_dir_entry **ret, const char **residual) 296 const char **residual)
296{ 297{
297 const char *cp = name, *next; 298 const char *cp = name, *next;
298 struct proc_dir_entry *de; 299 struct proc_dir_entry *de;
299 int len; 300 int len;
300 int rtn = 0;
301 301
302 de = *ret; 302 de = *ret;
303 if (!de) 303 if (!de)
304 de = &proc_root; 304 de = &proc_root;
305 305
306 spin_lock(&proc_subdir_lock);
307 while (1) { 306 while (1) {
308 next = strchr(cp, '/'); 307 next = strchr(cp, '/');
309 if (!next) 308 if (!next)
@@ -315,16 +314,25 @@ static int xlate_proc_name(const char *name,
315 break; 314 break;
316 } 315 }
317 if (!de) { 316 if (!de) {
318 rtn = -ENOENT; 317 WARN(1, "name '%s'\n", name);
319 goto out; 318 return -ENOENT;
320 } 319 }
321 cp += len + 1; 320 cp += len + 1;
322 } 321 }
323 *residual = cp; 322 *residual = cp;
324 *ret = de; 323 *ret = de;
325out: 324 return 0;
325}
326
327static int xlate_proc_name(const char *name, struct proc_dir_entry **ret,
328 const char **residual)
329{
330 int rv;
331
332 spin_lock(&proc_subdir_lock);
333 rv = __xlate_proc_name(name, ret, residual);
326 spin_unlock(&proc_subdir_lock); 334 spin_unlock(&proc_subdir_lock);
327 return rtn; 335 return rv;
328} 336}
329 337
330static DEFINE_IDA(proc_inum_ida); 338static DEFINE_IDA(proc_inum_ida);
@@ -662,6 +670,7 @@ struct proc_dir_entry *proc_symlink(const char *name,
662 } 670 }
663 return ent; 671 return ent;
664} 672}
673EXPORT_SYMBOL(proc_symlink);
665 674
666struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode, 675struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
667 struct proc_dir_entry *parent) 676 struct proc_dir_entry *parent)
@@ -700,6 +709,7 @@ struct proc_dir_entry *proc_mkdir(const char *name,
700{ 709{
701 return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent); 710 return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent);
702} 711}
712EXPORT_SYMBOL(proc_mkdir);
703 713
704struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, 714struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
705 struct proc_dir_entry *parent) 715 struct proc_dir_entry *parent)
@@ -728,6 +738,7 @@ struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
728 } 738 }
729 return ent; 739 return ent;
730} 740}
741EXPORT_SYMBOL(create_proc_entry);
731 742
732struct proc_dir_entry *proc_create_data(const char *name, mode_t mode, 743struct proc_dir_entry *proc_create_data(const char *name, mode_t mode,
733 struct proc_dir_entry *parent, 744 struct proc_dir_entry *parent,
@@ -762,6 +773,7 @@ out_free:
762out: 773out:
763 return NULL; 774 return NULL;
764} 775}
776EXPORT_SYMBOL(proc_create_data);
765 777
766static void free_proc_entry(struct proc_dir_entry *de) 778static void free_proc_entry(struct proc_dir_entry *de)
767{ 779{
@@ -793,11 +805,13 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
793 const char *fn = name; 805 const char *fn = name;
794 int len; 806 int len;
795 807
796 if (xlate_proc_name(name, &parent, &fn) != 0) 808 spin_lock(&proc_subdir_lock);
809 if (__xlate_proc_name(name, &parent, &fn) != 0) {
810 spin_unlock(&proc_subdir_lock);
797 return; 811 return;
812 }
798 len = strlen(fn); 813 len = strlen(fn);
799 814
800 spin_lock(&proc_subdir_lock);
801 for (p = &parent->subdir; *p; p=&(*p)->next ) { 815 for (p = &parent->subdir; *p; p=&(*p)->next ) {
802 if (proc_match(len, fn, *p)) { 816 if (proc_match(len, fn, *p)) {
803 de = *p; 817 de = *p;
@@ -807,8 +821,10 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
807 } 821 }
808 } 822 }
809 spin_unlock(&proc_subdir_lock); 823 spin_unlock(&proc_subdir_lock);
810 if (!de) 824 if (!de) {
825 WARN(1, "name '%s'\n", name);
811 return; 826 return;
827 }
812 828
813 spin_lock(&de->pde_unload_lock); 829 spin_lock(&de->pde_unload_lock);
814 /* 830 /*
@@ -853,3 +869,4 @@ continue_removing:
853 de->parent->name, de->name, de->subdir->name); 869 de->parent->name, de->name, de->subdir->name);
854 pde_put(de); 870 pde_put(de);
855} 871}
872EXPORT_SYMBOL(remove_proc_entry);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 445a02bcaab3..d35b23238fb1 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -18,6 +18,7 @@
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/smp_lock.h> 19#include <linux/smp_lock.h>
20#include <linux/sysctl.h> 20#include <linux/sysctl.h>
21#include <linux/slab.h>
21 22
22#include <asm/system.h> 23#include <asm/system.h>
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a44a7897fd4d..19979a2ce272 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -19,6 +19,7 @@
19#include <linux/highmem.h> 19#include <linux/highmem.h>
20#include <linux/bootmem.h> 20#include <linux/bootmem.h>
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/slab.h>
22#include <asm/uaccess.h> 23#include <asm/uaccess.h>
23#include <asm/io.h> 24#include <asm/io.h>
24#include <linux/list.h> 25#include <linux/list.h>
@@ -490,7 +491,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
490 } 491 }
491 read_unlock(&kclist_lock); 492 read_unlock(&kclist_lock);
492 493
493 if (m == NULL) { 494 if (&m->list == &kclist_head) {
494 if (clear_user(buffer, tsz)) 495 if (clear_user(buffer, tsz))
495 return -EFAULT; 496 return -EFAULT;
496 } else if (is_vmalloc_or_module_addr((void *)start)) { 497 } else if (is_vmalloc_or_module_addr((void *)start)) {
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index 7ca78346d3f0..cfe90a48a6e8 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -12,37 +12,37 @@
12#include <linux/poll.h> 12#include <linux/poll.h>
13#include <linux/proc_fs.h> 13#include <linux/proc_fs.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/syslog.h>
15 16
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17#include <asm/io.h> 18#include <asm/io.h>
18 19
19extern wait_queue_head_t log_wait; 20extern wait_queue_head_t log_wait;
20 21
21extern int do_syslog(int type, char __user *bug, int count);
22
23static int kmsg_open(struct inode * inode, struct file * file) 22static int kmsg_open(struct inode * inode, struct file * file)
24{ 23{
25 return do_syslog(1,NULL,0); 24 return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE);
26} 25}
27 26
28static int kmsg_release(struct inode * inode, struct file * file) 27static int kmsg_release(struct inode * inode, struct file * file)
29{ 28{
30 (void) do_syslog(0,NULL,0); 29 (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE);
31 return 0; 30 return 0;
32} 31}
33 32
34static ssize_t kmsg_read(struct file *file, char __user *buf, 33static ssize_t kmsg_read(struct file *file, char __user *buf,
35 size_t count, loff_t *ppos) 34 size_t count, loff_t *ppos)
36{ 35{
37 if ((file->f_flags & O_NONBLOCK) && !do_syslog(9, NULL, 0)) 36 if ((file->f_flags & O_NONBLOCK) &&
37 !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
38 return -EAGAIN; 38 return -EAGAIN;
39 return do_syslog(2, buf, count); 39 return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE);
40} 40}
41 41
42static unsigned int kmsg_poll(struct file *file, poll_table *wait) 42static unsigned int kmsg_poll(struct file *file, poll_table *wait)
43{ 43{
44 poll_wait(file, &log_wait, wait); 44 poll_wait(file, &log_wait, wait);
45 if (do_syslog(9, NULL, 0)) 45 if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
46 return POLLIN | POLLRDNORM; 46 return POLLIN | POLLRDNORM;
47 return 0; 47 return 0;
48} 48}
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 9fe7d7ebe115..b1822dde55c2 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -21,7 +21,6 @@
21#include <linux/mmzone.h> 21#include <linux/mmzone.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/slab.h>
25#include <linux/smp.h> 24#include <linux/smp.h>
26#include <linux/seq_file.h> 25#include <linux/seq_file.h>
27#include <linux/hugetlb.h> 26#include <linux/hugetlb.h>
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index 123257bb356b..ce94801f48ca 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -10,16 +10,20 @@
10#include <linux/seq_file.h> 10#include <linux/seq_file.h>
11#include <linux/stat.h> 11#include <linux/stat.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/of.h>
14#include <linux/module.h>
15#include <linux/slab.h>
13#include <asm/prom.h> 16#include <asm/prom.h>
14#include <asm/uaccess.h> 17#include <asm/uaccess.h>
15#include "internal.h" 18#include "internal.h"
16 19
17#ifndef HAVE_ARCH_DEVTREE_FIXUPS
18static inline void set_node_proc_entry(struct device_node *np, 20static inline void set_node_proc_entry(struct device_node *np,
19 struct proc_dir_entry *de) 21 struct proc_dir_entry *de)
20{ 22{
21} 23#ifdef HAVE_ARCH_DEVTREE_FIXUPS
24 np->pde = de;
22#endif 25#endif
26}
23 27
24static struct proc_dir_entry *proc_device_tree; 28static struct proc_dir_entry *proc_device_tree;
25 29
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 04d1270f1c38..9020ac15baaa 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -14,6 +14,7 @@
14#include <linux/time.h> 14#include <linux/time.h>
15#include <linux/proc_fs.h> 15#include <linux/proc_fs.h>
16#include <linux/stat.h> 16#include <linux/stat.h>
17#include <linux/slab.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
19#include <linux/module.h> 20#include <linux/module.h>
diff --git a/fs/proc/root.c b/fs/proc/root.c
index b080b791d9e3..757c069f2a65 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -220,9 +220,3 @@ void pid_ns_release_proc(struct pid_namespace *ns)
220{ 220{
221 mntput(ns->proc_mnt); 221 mntput(ns->proc_mnt);
222} 222}
223
224EXPORT_SYMBOL(proc_symlink);
225EXPORT_SYMBOL(proc_mkdir);
226EXPORT_SYMBOL(create_proc_entry);
227EXPORT_SYMBOL(proc_create_data);
228EXPORT_SYMBOL(remove_proc_entry);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index b9b7aad2003d..bf31b03fc275 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -1,6 +1,5 @@
1#include <linux/cpumask.h> 1#include <linux/cpumask.h>
2#include <linux/fs.h> 2#include <linux/fs.h>
3#include <linux/gfp.h>
4#include <linux/init.h> 3#include <linux/init.h>
5#include <linux/interrupt.h> 4#include <linux/interrupt.h>
6#include <linux/kernel_stat.h> 5#include <linux/kernel_stat.h>
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 47c03f4336b8..070553427dd5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -4,6 +4,7 @@
4#include <linux/seq_file.h> 4#include <linux/seq_file.h>
5#include <linux/highmem.h> 5#include <linux/highmem.h>
6#include <linux/ptrace.h> 6#include <linux/ptrace.h>
7#include <linux/slab.h>
7#include <linux/pagemap.h> 8#include <linux/pagemap.h>
8#include <linux/mempolicy.h> 9#include <linux/mempolicy.h>
9#include <linux/swap.h> 10#include <linux/swap.h>
@@ -16,7 +17,7 @@
16 17
17void task_mem(struct seq_file *m, struct mm_struct *mm) 18void task_mem(struct seq_file *m, struct mm_struct *mm)
18{ 19{
19 unsigned long data, text, lib; 20 unsigned long data, text, lib, swap;
20 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; 21 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
21 22
22 /* 23 /*
@@ -36,6 +37,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
36 data = mm->total_vm - mm->shared_vm - mm->stack_vm; 37 data = mm->total_vm - mm->shared_vm - mm->stack_vm;
37 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; 38 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
38 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; 39 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
40 swap = get_mm_counter(mm, MM_SWAPENTS);
39 seq_printf(m, 41 seq_printf(m,
40 "VmPeak:\t%8lu kB\n" 42 "VmPeak:\t%8lu kB\n"
41 "VmSize:\t%8lu kB\n" 43 "VmSize:\t%8lu kB\n"
@@ -46,7 +48,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
46 "VmStk:\t%8lu kB\n" 48 "VmStk:\t%8lu kB\n"
47 "VmExe:\t%8lu kB\n" 49 "VmExe:\t%8lu kB\n"
48 "VmLib:\t%8lu kB\n" 50 "VmLib:\t%8lu kB\n"
49 "VmPTE:\t%8lu kB\n", 51 "VmPTE:\t%8lu kB\n"
52 "VmSwap:\t%8lu kB\n",
50 hiwater_vm << (PAGE_SHIFT-10), 53 hiwater_vm << (PAGE_SHIFT-10),
51 (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), 54 (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
52 mm->locked_vm << (PAGE_SHIFT-10), 55 mm->locked_vm << (PAGE_SHIFT-10),
@@ -54,7 +57,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
54 total_rss << (PAGE_SHIFT-10), 57 total_rss << (PAGE_SHIFT-10),
55 data << (PAGE_SHIFT-10), 58 data << (PAGE_SHIFT-10),
56 mm->stack_vm << (PAGE_SHIFT-10), text, lib, 59 mm->stack_vm << (PAGE_SHIFT-10), text, lib,
57 (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); 60 (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
61 swap << (PAGE_SHIFT-10));
58} 62}
59 63
60unsigned long task_vsize(struct mm_struct *mm) 64unsigned long task_vsize(struct mm_struct *mm)
@@ -65,11 +69,11 @@ unsigned long task_vsize(struct mm_struct *mm)
65int task_statm(struct mm_struct *mm, int *shared, int *text, 69int task_statm(struct mm_struct *mm, int *shared, int *text,
66 int *data, int *resident) 70 int *data, int *resident)
67{ 71{
68 *shared = get_mm_counter(mm, file_rss); 72 *shared = get_mm_counter(mm, MM_FILEPAGES);
69 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) 73 *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
70 >> PAGE_SHIFT; 74 >> PAGE_SHIFT;
71 *data = mm->total_vm - mm->shared_vm; 75 *data = mm->total_vm - mm->shared_vm;
72 *resident = *shared + get_mm_counter(mm, anon_rss); 76 *resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
73 return mm->total_vm; 77 return mm->total_vm;
74} 78}
75 79
@@ -361,12 +365,11 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
361 if (!pte_present(ptent)) 365 if (!pte_present(ptent))
362 continue; 366 continue;
363 367
364 mss->resident += PAGE_SIZE;
365
366 page = vm_normal_page(vma, addr, ptent); 368 page = vm_normal_page(vma, addr, ptent);
367 if (!page) 369 if (!page)
368 continue; 370 continue;
369 371
372 mss->resident += PAGE_SIZE;
370 /* Accumulate the size in pages that have been accessed. */ 373 /* Accumulate the size in pages that have been accessed. */
371 if (pte_young(ptent) || PageReferenced(page)) 374 if (pte_young(ptent) || PageReferenced(page))
372 mss->referenced += PAGE_SIZE; 375 mss->referenced += PAGE_SIZE;
@@ -404,6 +407,7 @@ static int show_smap(struct seq_file *m, void *v)
404 407
405 memset(&mss, 0, sizeof mss); 408 memset(&mss, 0, sizeof mss);
406 mss.vma = vma; 409 mss.vma = vma;
410 /* mmap_sem is held in m_start */
407 if (vma->vm_mm && !is_vm_hugetlb_page(vma)) 411 if (vma->vm_mm && !is_vm_hugetlb_page(vma))
408 walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); 412 walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
409 413
@@ -550,7 +554,8 @@ const struct file_operations proc_clear_refs_operations = {
550}; 554};
551 555
552struct pagemapread { 556struct pagemapread {
553 u64 __user *out, *end; 557 int pos, len;
558 u64 *buffer;
554}; 559};
555 560
556#define PM_ENTRY_BYTES sizeof(u64) 561#define PM_ENTRY_BYTES sizeof(u64)
@@ -573,10 +578,8 @@ struct pagemapread {
573static int add_to_pagemap(unsigned long addr, u64 pfn, 578static int add_to_pagemap(unsigned long addr, u64 pfn,
574 struct pagemapread *pm) 579 struct pagemapread *pm)
575{ 580{
576 if (put_user(pfn, pm->out)) 581 pm->buffer[pm->pos++] = pfn;
577 return -EFAULT; 582 if (pm->pos >= pm->len)
578 pm->out++;
579 if (pm->out >= pm->end)
580 return PM_END_OF_BUFFER; 583 return PM_END_OF_BUFFER;
581 return 0; 584 return 0;
582} 585}
@@ -659,31 +662,18 @@ static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
659 return pme; 662 return pme;
660} 663}
661 664
662static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr, 665/* This function walks within one hugetlb entry in the single call */
663 unsigned long end, struct mm_walk *walk) 666static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
667 unsigned long addr, unsigned long end,
668 struct mm_walk *walk)
664{ 669{
665 struct vm_area_struct *vma;
666 struct pagemapread *pm = walk->private; 670 struct pagemapread *pm = walk->private;
667 struct hstate *hs = NULL;
668 int err = 0; 671 int err = 0;
672 u64 pfn;
669 673
670 vma = find_vma(walk->mm, addr);
671 if (vma)
672 hs = hstate_vma(vma);
673 for (; addr != end; addr += PAGE_SIZE) { 674 for (; addr != end; addr += PAGE_SIZE) {
674 u64 pfn = PM_NOT_PRESENT; 675 int offset = (addr & ~hmask) >> PAGE_SHIFT;
675 676 pfn = huge_pte_to_pagemap_entry(*pte, offset);
676 if (vma && (addr >= vma->vm_end)) {
677 vma = find_vma(walk->mm, addr);
678 if (vma)
679 hs = hstate_vma(vma);
680 }
681
682 if (vma && (vma->vm_start <= addr) && is_vm_hugetlb_page(vma)) {
683 /* calculate pfn of the "raw" page in the hugepage. */
684 int offset = (addr & ~huge_page_mask(hs)) >> PAGE_SHIFT;
685 pfn = huge_pte_to_pagemap_entry(*pte, offset);
686 }
687 err = add_to_pagemap(addr, pfn, pm); 677 err = add_to_pagemap(addr, pfn, pm);
688 if (err) 678 if (err)
689 return err; 679 return err;
@@ -718,21 +708,20 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
718 * determine which areas of memory are actually mapped and llseek to 708 * determine which areas of memory are actually mapped and llseek to
719 * skip over unmapped regions. 709 * skip over unmapped regions.
720 */ 710 */
711#define PAGEMAP_WALK_SIZE (PMD_SIZE)
721static ssize_t pagemap_read(struct file *file, char __user *buf, 712static ssize_t pagemap_read(struct file *file, char __user *buf,
722 size_t count, loff_t *ppos) 713 size_t count, loff_t *ppos)
723{ 714{
724 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); 715 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
725 struct page **pages, *page;
726 unsigned long uaddr, uend;
727 struct mm_struct *mm; 716 struct mm_struct *mm;
728 struct pagemapread pm; 717 struct pagemapread pm;
729 int pagecount;
730 int ret = -ESRCH; 718 int ret = -ESRCH;
731 struct mm_walk pagemap_walk = {}; 719 struct mm_walk pagemap_walk = {};
732 unsigned long src; 720 unsigned long src;
733 unsigned long svpfn; 721 unsigned long svpfn;
734 unsigned long start_vaddr; 722 unsigned long start_vaddr;
735 unsigned long end_vaddr; 723 unsigned long end_vaddr;
724 int copied = 0;
736 725
737 if (!task) 726 if (!task)
738 goto out; 727 goto out;
@@ -755,35 +744,12 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
755 if (!mm) 744 if (!mm)
756 goto out_task; 745 goto out_task;
757 746
758 747 pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
759 uaddr = (unsigned long)buf & PAGE_MASK; 748 pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
760 uend = (unsigned long)(buf + count);
761 pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;
762 ret = 0;
763 if (pagecount == 0)
764 goto out_mm;
765 pages = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
766 ret = -ENOMEM; 749 ret = -ENOMEM;
767 if (!pages) 750 if (!pm.buffer)
768 goto out_mm; 751 goto out_mm;
769 752
770 down_read(&current->mm->mmap_sem);
771 ret = get_user_pages(current, current->mm, uaddr, pagecount,
772 1, 0, pages, NULL);
773 up_read(&current->mm->mmap_sem);
774
775 if (ret < 0)
776 goto out_free;
777
778 if (ret != pagecount) {
779 pagecount = ret;
780 ret = -EFAULT;
781 goto out_pages;
782 }
783
784 pm.out = (u64 __user *)buf;
785 pm.end = (u64 __user *)(buf + count);
786
787 pagemap_walk.pmd_entry = pagemap_pte_range; 753 pagemap_walk.pmd_entry = pagemap_pte_range;
788 pagemap_walk.pte_hole = pagemap_pte_hole; 754 pagemap_walk.pte_hole = pagemap_pte_hole;
789 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; 755 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
@@ -805,23 +771,36 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
805 * user buffer is tracked in "pm", and the walk 771 * user buffer is tracked in "pm", and the walk
806 * will stop when we hit the end of the buffer. 772 * will stop when we hit the end of the buffer.
807 */ 773 */
808 ret = walk_page_range(start_vaddr, end_vaddr, &pagemap_walk); 774 ret = 0;
809 if (ret == PM_END_OF_BUFFER) 775 while (count && (start_vaddr < end_vaddr)) {
810 ret = 0; 776 int len;
811 /* don't need mmap_sem for these, but this looks cleaner */ 777 unsigned long end;
812 *ppos += (char __user *)pm.out - buf; 778
813 if (!ret) 779 pm.pos = 0;
814 ret = (char __user *)pm.out - buf; 780 end = start_vaddr + PAGEMAP_WALK_SIZE;
815 781 /* overflow ? */
816out_pages: 782 if (end < start_vaddr || end > end_vaddr)
817 for (; pagecount; pagecount--) { 783 end = end_vaddr;
818 page = pages[pagecount-1]; 784 down_read(&mm->mmap_sem);
819 if (!PageReserved(page)) 785 ret = walk_page_range(start_vaddr, end, &pagemap_walk);
820 SetPageDirty(page); 786 up_read(&mm->mmap_sem);
821 page_cache_release(page); 787 start_vaddr = end;
788
789 len = min(count, PM_ENTRY_BYTES * pm.pos);
790 if (copy_to_user(buf, pm.buffer, len)) {
791 ret = -EFAULT;
792 goto out_free;
793 }
794 copied += len;
795 buf += len;
796 count -= len;
822 } 797 }
798 *ppos += copied;
799 if (!ret || ret == PM_END_OF_BUFFER)
800 ret = copied;
801
823out_free: 802out_free:
824 kfree(pages); 803 kfree(pm.buffer);
825out_mm: 804out_mm:
826 mmput(mm); 805 mmput(mm);
827out_task: 806out_task:
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 5d9fd64ef81a..46d4b5d72bd3 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -5,6 +5,7 @@
5#include <linux/fs_struct.h> 5#include <linux/fs_struct.h>
6#include <linux/mount.h> 6#include <linux/mount.h>
7#include <linux/ptrace.h> 7#include <linux/ptrace.h>
8#include <linux/slab.h>
8#include <linux/seq_file.h> 9#include <linux/seq_file.h>
9#include "internal.h" 10#include "internal.h"
10 11
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 0872afa58d39..9fbc99ec799a 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -12,6 +12,7 @@
12#include <linux/user.h> 12#include <linux/user.h>
13#include <linux/elf.h> 13#include <linux/elf.h>
14#include <linux/elfcore.h> 14#include <linux/elfcore.h>
15#include <linux/slab.h>
15#include <linux/highmem.h> 16#include <linux/highmem.h>
16#include <linux/bootmem.h> 17#include <linux/bootmem.h>
17#include <linux/init.h> 18#include <linux/init.h>
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index ebf3440d28ca..277575ddc05c 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -201,7 +201,8 @@ static const char *qnx4_checkroot(struct super_block *sb)
201 rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE); 201 rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE);
202 if (rootdir->di_fname != NULL) { 202 if (rootdir->di_fname != NULL) {
203 QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname)); 203 QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
204 if (!strncmp(rootdir->di_fname, QNX4_BMNAME, sizeof QNX4_BMNAME)) { 204 if (!strcmp(rootdir->di_fname,
205 QNX4_BMNAME)) {
205 found = 1; 206 found = 1;
206 qnx4_sb(sb)->BitMap = kmalloc( sizeof( struct qnx4_inode_entry ), GFP_KERNEL ); 207 qnx4_sb(sb)->BitMap = kmalloc( sizeof( struct qnx4_inode_entry ), GFP_KERNEL );
207 if (!qnx4_sb(sb)->BitMap) { 208 if (!qnx4_sb(sb)->BitMap) {
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index efc02ebb8c70..3e21b1e2ad3a 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -33,6 +33,14 @@ config PRINT_QUOTA_WARNING
33 Note that this behavior is currently deprecated and may go away in 33 Note that this behavior is currently deprecated and may go away in
34 future. Please use notification via netlink socket instead. 34 future. Please use notification via netlink socket instead.
35 35
36config QUOTA_DEBUG
37 bool "Additional quota sanity checks"
38 depends on QUOTA
39 default n
40 help
41 If you say Y here, quota subsystem will perform some additional
42 sanity checks of quota internal structures. If unsure, say N.
43
36# Generic support for tree structured quota files. Selected when needed. 44# Generic support for tree structured quota files. Selected when needed.
37config QUOTA_TREE 45config QUOTA_TREE
38 tristate 46 tristate
@@ -59,3 +67,8 @@ config QUOTACTL
59 bool 67 bool
60 depends on XFS_QUOTA || QUOTA 68 depends on XFS_QUOTA || QUOTA
61 default y 69 default y
70
71config QUOTACTL_COMPAT
72 bool
73 depends on QUOTACTL && COMPAT_FOR_U64_ALIGNMENT
74 default y
diff --git a/fs/quota/Makefile b/fs/quota/Makefile
index 68d4f6dc0578..5f9e9e276af0 100644
--- a/fs/quota/Makefile
+++ b/fs/quota/Makefile
@@ -3,3 +3,5 @@ obj-$(CONFIG_QFMT_V1) += quota_v1.o
3obj-$(CONFIG_QFMT_V2) += quota_v2.o 3obj-$(CONFIG_QFMT_V2) += quota_v2.o
4obj-$(CONFIG_QUOTA_TREE) += quota_tree.o 4obj-$(CONFIG_QUOTA_TREE) += quota_tree.o
5obj-$(CONFIG_QUOTACTL) += quota.o 5obj-$(CONFIG_QUOTACTL) += quota.o
6obj-$(CONFIG_QUOTACTL_COMPAT) += compat.o
7obj-$(CONFIG_QUOTA_NETLINK_INTERFACE) += netlink.o
diff --git a/fs/quota/compat.c b/fs/quota/compat.c
new file mode 100644
index 000000000000..fb1892fe3e56
--- /dev/null
+++ b/fs/quota/compat.c
@@ -0,0 +1,118 @@
1
2#include <linux/syscalls.h>
3#include <linux/compat.h>
4#include <linux/quotaops.h>
5
6/*
7 * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64)
8 * and is necessary due to alignment problems.
9 */
10struct compat_if_dqblk {
11 compat_u64 dqb_bhardlimit;
12 compat_u64 dqb_bsoftlimit;
13 compat_u64 dqb_curspace;
14 compat_u64 dqb_ihardlimit;
15 compat_u64 dqb_isoftlimit;
16 compat_u64 dqb_curinodes;
17 compat_u64 dqb_btime;
18 compat_u64 dqb_itime;
19 compat_uint_t dqb_valid;
20};
21
22/* XFS structures */
23struct compat_fs_qfilestat {
24 compat_u64 dqb_bhardlimit;
25 compat_u64 qfs_nblks;
26 compat_uint_t qfs_nextents;
27};
28
29struct compat_fs_quota_stat {
30 __s8 qs_version;
31 __u16 qs_flags;
32 __s8 qs_pad;
33 struct compat_fs_qfilestat qs_uquota;
34 struct compat_fs_qfilestat qs_gquota;
35 compat_uint_t qs_incoredqs;
36 compat_int_t qs_btimelimit;
37 compat_int_t qs_itimelimit;
38 compat_int_t qs_rtbtimelimit;
39 __u16 qs_bwarnlimit;
40 __u16 qs_iwarnlimit;
41};
42
43asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
44 qid_t id, void __user *addr)
45{
46 unsigned int cmds;
47 struct if_dqblk __user *dqblk;
48 struct compat_if_dqblk __user *compat_dqblk;
49 struct fs_quota_stat __user *fsqstat;
50 struct compat_fs_quota_stat __user *compat_fsqstat;
51 compat_uint_t data;
52 u16 xdata;
53 long ret;
54
55 cmds = cmd >> SUBCMDSHIFT;
56
57 switch (cmds) {
58 case Q_GETQUOTA:
59 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
60 compat_dqblk = addr;
61 ret = sys_quotactl(cmd, special, id, dqblk);
62 if (ret)
63 break;
64 if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
65 get_user(data, &dqblk->dqb_valid) ||
66 put_user(data, &compat_dqblk->dqb_valid))
67 ret = -EFAULT;
68 break;
69 case Q_SETQUOTA:
70 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
71 compat_dqblk = addr;
72 ret = -EFAULT;
73 if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) ||
74 get_user(data, &compat_dqblk->dqb_valid) ||
75 put_user(data, &dqblk->dqb_valid))
76 break;
77 ret = sys_quotactl(cmd, special, id, dqblk);
78 break;
79 case Q_XGETQSTAT:
80 fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
81 compat_fsqstat = addr;
82 ret = sys_quotactl(cmd, special, id, fsqstat);
83 if (ret)
84 break;
85 ret = -EFAULT;
86 /* Copying qs_version, qs_flags, qs_pad */
87 if (copy_in_user(compat_fsqstat, fsqstat,
88 offsetof(struct compat_fs_quota_stat, qs_uquota)))
89 break;
90 /* Copying qs_uquota */
91 if (copy_in_user(&compat_fsqstat->qs_uquota,
92 &fsqstat->qs_uquota,
93 sizeof(compat_fsqstat->qs_uquota)) ||
94 get_user(data, &fsqstat->qs_uquota.qfs_nextents) ||
95 put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents))
96 break;
97 /* Copying qs_gquota */
98 if (copy_in_user(&compat_fsqstat->qs_gquota,
99 &fsqstat->qs_gquota,
100 sizeof(compat_fsqstat->qs_gquota)) ||
101 get_user(data, &fsqstat->qs_gquota.qfs_nextents) ||
102 put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents))
103 break;
104 /* Copying the rest */
105 if (copy_in_user(&compat_fsqstat->qs_incoredqs,
106 &fsqstat->qs_incoredqs,
107 sizeof(struct compat_fs_quota_stat) -
108 offsetof(struct compat_fs_quota_stat, qs_incoredqs)) ||
109 get_user(xdata, &fsqstat->qs_iwarnlimit) ||
110 put_user(xdata, &compat_fsqstat->qs_iwarnlimit))
111 break;
112 ret = 0;
113 break;
114 default:
115 ret = sys_quotactl(cmd, special, id, addr);
116 }
117 return ret;
118}
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index dea86abdf2e7..788b5802a7ce 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -80,8 +80,6 @@
80 80
81#include <asm/uaccess.h> 81#include <asm/uaccess.h>
82 82
83#define __DQUOT_PARANOIA
84
85/* 83/*
86 * There are three quota SMP locks. dq_list_lock protects all lists with quotas 84 * There are three quota SMP locks. dq_list_lock protects all lists with quotas
87 * and quota formats, dqstats structure containing statistics about the lists 85 * and quota formats, dqstats structure containing statistics about the lists
@@ -100,9 +98,13 @@
100 * 98 *
101 * Any operation working on dquots via inode pointers must hold dqptr_sem. If 99 * Any operation working on dquots via inode pointers must hold dqptr_sem. If
102 * operation is just reading pointers from inode (or not using them at all) the 100 * operation is just reading pointers from inode (or not using them at all) the
103 * read lock is enough. If pointers are altered function must hold write lock 101 * read lock is enough. If pointers are altered function must hold write lock.
104 * (these locking rules also apply for S_NOQUOTA flag in the inode - note that 102 * Special care needs to be taken about S_NOQUOTA inode flag (marking that
105 * for altering the flag i_mutex is also needed). 103 * inode is a quota file). Functions adding pointers from inode to dquots have
104 * to check this flag under dqptr_sem and then (if S_NOQUOTA is not set) they
105 * have to do all pointer modifications before dropping dqptr_sem. This makes
106 * sure they cannot race with quotaon which first sets S_NOQUOTA flag and
107 * then drops all pointers to dquots from an inode.
106 * 108 *
107 * Each dquot has its dq_lock mutex. Locked dquots might not be referenced 109 * Each dquot has its dq_lock mutex. Locked dquots might not be referenced
108 * from inodes (dquot_alloc_space() and such don't check the dq_lock). 110 * from inodes (dquot_alloc_space() and such don't check the dq_lock).
@@ -225,6 +227,9 @@ static struct hlist_head *dquot_hash;
225struct dqstats dqstats; 227struct dqstats dqstats;
226EXPORT_SYMBOL(dqstats); 228EXPORT_SYMBOL(dqstats);
227 229
230static qsize_t inode_get_rsv_space(struct inode *inode);
231static void __dquot_initialize(struct inode *inode, int type);
232
228static inline unsigned int 233static inline unsigned int
229hashfn(const struct super_block *sb, unsigned int id, int type) 234hashfn(const struct super_block *sb, unsigned int id, int type)
230{ 235{
@@ -564,7 +569,7 @@ out:
564} 569}
565EXPORT_SYMBOL(dquot_scan_active); 570EXPORT_SYMBOL(dquot_scan_active);
566 571
567int vfs_quota_sync(struct super_block *sb, int type) 572int vfs_quota_sync(struct super_block *sb, int type, int wait)
568{ 573{
569 struct list_head *dirty; 574 struct list_head *dirty;
570 struct dquot *dquot; 575 struct dquot *dquot;
@@ -609,6 +614,33 @@ int vfs_quota_sync(struct super_block *sb, int type)
609 spin_unlock(&dq_list_lock); 614 spin_unlock(&dq_list_lock);
610 mutex_unlock(&dqopt->dqonoff_mutex); 615 mutex_unlock(&dqopt->dqonoff_mutex);
611 616
617 if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE))
618 return 0;
619
620 /* This is not very clever (and fast) but currently I don't know about
621 * any other simple way of getting quota data to disk and we must get
622 * them there for userspace to be visible... */
623 if (sb->s_op->sync_fs)
624 sb->s_op->sync_fs(sb, 1);
625 sync_blockdev(sb->s_bdev);
626
627 /*
628 * Now when everything is written we can discard the pagecache so
629 * that userspace sees the changes.
630 */
631 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
632 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
633 if (type != -1 && cnt != type)
634 continue;
635 if (!sb_has_quota_active(sb, cnt))
636 continue;
637 mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex,
638 I_MUTEX_QUOTA);
639 truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
640 mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex);
641 }
642 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
643
612 return 0; 644 return 0;
613} 645}
614EXPORT_SYMBOL(vfs_quota_sync); 646EXPORT_SYMBOL(vfs_quota_sync);
@@ -661,7 +693,7 @@ void dqput(struct dquot *dquot)
661 693
662 if (!dquot) 694 if (!dquot)
663 return; 695 return;
664#ifdef __DQUOT_PARANOIA 696#ifdef CONFIG_QUOTA_DEBUG
665 if (!atomic_read(&dquot->dq_count)) { 697 if (!atomic_read(&dquot->dq_count)) {
666 printk("VFS: dqput: trying to free free dquot\n"); 698 printk("VFS: dqput: trying to free free dquot\n");
667 printk("VFS: device %s, dquot of %s %d\n", 699 printk("VFS: device %s, dquot of %s %d\n",
@@ -714,7 +746,7 @@ we_slept:
714 goto we_slept; 746 goto we_slept;
715 } 747 }
716 atomic_dec(&dquot->dq_count); 748 atomic_dec(&dquot->dq_count);
717#ifdef __DQUOT_PARANOIA 749#ifdef CONFIG_QUOTA_DEBUG
718 /* sanity check */ 750 /* sanity check */
719 BUG_ON(!list_empty(&dquot->dq_free)); 751 BUG_ON(!list_empty(&dquot->dq_free));
720#endif 752#endif
@@ -811,7 +843,7 @@ we_slept:
811 dquot = NULL; 843 dquot = NULL;
812 goto out; 844 goto out;
813 } 845 }
814#ifdef __DQUOT_PARANOIA 846#ifdef CONFIG_QUOTA_DEBUG
815 BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */ 847 BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */
816#endif 848#endif
817out: 849out:
@@ -840,11 +872,18 @@ static int dqinit_needed(struct inode *inode, int type)
840static void add_dquot_ref(struct super_block *sb, int type) 872static void add_dquot_ref(struct super_block *sb, int type)
841{ 873{
842 struct inode *inode, *old_inode = NULL; 874 struct inode *inode, *old_inode = NULL;
875#ifdef CONFIG_QUOTA_DEBUG
876 int reserved = 0;
877#endif
843 878
844 spin_lock(&inode_lock); 879 spin_lock(&inode_lock);
845 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 880 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
846 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) 881 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
847 continue; 882 continue;
883#ifdef CONFIG_QUOTA_DEBUG
884 if (unlikely(inode_get_rsv_space(inode) > 0))
885 reserved = 1;
886#endif
848 if (!atomic_read(&inode->i_writecount)) 887 if (!atomic_read(&inode->i_writecount))
849 continue; 888 continue;
850 if (!dqinit_needed(inode, type)) 889 if (!dqinit_needed(inode, type))
@@ -854,7 +893,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
854 spin_unlock(&inode_lock); 893 spin_unlock(&inode_lock);
855 894
856 iput(old_inode); 895 iput(old_inode);
857 sb->dq_op->initialize(inode, type); 896 __dquot_initialize(inode, type);
858 /* We hold a reference to 'inode' so it couldn't have been 897 /* We hold a reference to 'inode' so it couldn't have been
859 * removed from s_inodes list while we dropped the inode_lock. 898 * removed from s_inodes list while we dropped the inode_lock.
860 * We cannot iput the inode now as we can be holding the last 899 * We cannot iput the inode now as we can be holding the last
@@ -865,6 +904,14 @@ static void add_dquot_ref(struct super_block *sb, int type)
865 } 904 }
866 spin_unlock(&inode_lock); 905 spin_unlock(&inode_lock);
867 iput(old_inode); 906 iput(old_inode);
907
908#ifdef CONFIG_QUOTA_DEBUG
909 if (reserved) {
910 printk(KERN_WARNING "VFS (%s): Writes happened before quota"
911 " was turned on thus quota information is probably "
912 "inconsistent. Please run quotacheck(8).\n", sb->s_id);
913 }
914#endif
868} 915}
869 916
870/* 917/*
@@ -891,7 +938,7 @@ static int remove_inode_dquot_ref(struct inode *inode, int type,
891 inode->i_dquot[type] = NULL; 938 inode->i_dquot[type] = NULL;
892 if (dquot) { 939 if (dquot) {
893 if (dqput_blocks(dquot)) { 940 if (dqput_blocks(dquot)) {
894#ifdef __DQUOT_PARANOIA 941#ifdef CONFIG_QUOTA_DEBUG
895 if (atomic_read(&dquot->dq_count) != 1) 942 if (atomic_read(&dquot->dq_count) != 1)
896 printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count)); 943 printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count));
897#endif 944#endif
@@ -978,10 +1025,12 @@ static inline void dquot_resv_space(struct dquot *dquot, qsize_t number)
978/* 1025/*
979 * Claim reserved quota space 1026 * Claim reserved quota space
980 */ 1027 */
981static void dquot_claim_reserved_space(struct dquot *dquot, 1028static void dquot_claim_reserved_space(struct dquot *dquot, qsize_t number)
982 qsize_t number)
983{ 1029{
984 WARN_ON(dquot->dq_dqb.dqb_rsvspace < number); 1030 if (dquot->dq_dqb.dqb_rsvspace < number) {
1031 WARN_ON_ONCE(1);
1032 number = dquot->dq_dqb.dqb_rsvspace;
1033 }
985 dquot->dq_dqb.dqb_curspace += number; 1034 dquot->dq_dqb.dqb_curspace += number;
986 dquot->dq_dqb.dqb_rsvspace -= number; 1035 dquot->dq_dqb.dqb_rsvspace -= number;
987} 1036}
@@ -989,7 +1038,12 @@ static void dquot_claim_reserved_space(struct dquot *dquot,
989static inline 1038static inline
990void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) 1039void dquot_free_reserved_space(struct dquot *dquot, qsize_t number)
991{ 1040{
992 dquot->dq_dqb.dqb_rsvspace -= number; 1041 if (dquot->dq_dqb.dqb_rsvspace >= number)
1042 dquot->dq_dqb.dqb_rsvspace -= number;
1043 else {
1044 WARN_ON_ONCE(1);
1045 dquot->dq_dqb.dqb_rsvspace = 0;
1046 }
993} 1047}
994 1048
995static void dquot_decr_inodes(struct dquot *dquot, qsize_t number) 1049static void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
@@ -1131,13 +1185,13 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1131 *warntype = QUOTA_NL_NOWARN; 1185 *warntype = QUOTA_NL_NOWARN;
1132 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || 1186 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
1133 test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1187 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1134 return QUOTA_OK; 1188 return 0;
1135 1189
1136 if (dquot->dq_dqb.dqb_ihardlimit && 1190 if (dquot->dq_dqb.dqb_ihardlimit &&
1137 newinodes > dquot->dq_dqb.dqb_ihardlimit && 1191 newinodes > dquot->dq_dqb.dqb_ihardlimit &&
1138 !ignore_hardlimit(dquot)) { 1192 !ignore_hardlimit(dquot)) {
1139 *warntype = QUOTA_NL_IHARDWARN; 1193 *warntype = QUOTA_NL_IHARDWARN;
1140 return NO_QUOTA; 1194 return -EDQUOT;
1141 } 1195 }
1142 1196
1143 if (dquot->dq_dqb.dqb_isoftlimit && 1197 if (dquot->dq_dqb.dqb_isoftlimit &&
@@ -1146,7 +1200,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1146 get_seconds() >= dquot->dq_dqb.dqb_itime && 1200 get_seconds() >= dquot->dq_dqb.dqb_itime &&
1147 !ignore_hardlimit(dquot)) { 1201 !ignore_hardlimit(dquot)) {
1148 *warntype = QUOTA_NL_ISOFTLONGWARN; 1202 *warntype = QUOTA_NL_ISOFTLONGWARN;
1149 return NO_QUOTA; 1203 return -EDQUOT;
1150 } 1204 }
1151 1205
1152 if (dquot->dq_dqb.dqb_isoftlimit && 1206 if (dquot->dq_dqb.dqb_isoftlimit &&
@@ -1157,7 +1211,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1157 sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; 1211 sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace;
1158 } 1212 }
1159 1213
1160 return QUOTA_OK; 1214 return 0;
1161} 1215}
1162 1216
1163/* needs dq_data_lock */ 1217/* needs dq_data_lock */
@@ -1169,7 +1223,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1169 *warntype = QUOTA_NL_NOWARN; 1223 *warntype = QUOTA_NL_NOWARN;
1170 if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) || 1224 if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) ||
1171 test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1225 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1172 return QUOTA_OK; 1226 return 0;
1173 1227
1174 tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace 1228 tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace
1175 + space; 1229 + space;
@@ -1179,7 +1233,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1179 !ignore_hardlimit(dquot)) { 1233 !ignore_hardlimit(dquot)) {
1180 if (!prealloc) 1234 if (!prealloc)
1181 *warntype = QUOTA_NL_BHARDWARN; 1235 *warntype = QUOTA_NL_BHARDWARN;
1182 return NO_QUOTA; 1236 return -EDQUOT;
1183 } 1237 }
1184 1238
1185 if (dquot->dq_dqb.dqb_bsoftlimit && 1239 if (dquot->dq_dqb.dqb_bsoftlimit &&
@@ -1189,7 +1243,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1189 !ignore_hardlimit(dquot)) { 1243 !ignore_hardlimit(dquot)) {
1190 if (!prealloc) 1244 if (!prealloc)
1191 *warntype = QUOTA_NL_BSOFTLONGWARN; 1245 *warntype = QUOTA_NL_BSOFTLONGWARN;
1192 return NO_QUOTA; 1246 return -EDQUOT;
1193 } 1247 }
1194 1248
1195 if (dquot->dq_dqb.dqb_bsoftlimit && 1249 if (dquot->dq_dqb.dqb_bsoftlimit &&
@@ -1205,10 +1259,10 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1205 * We don't allow preallocation to exceed softlimit so exceeding will 1259 * We don't allow preallocation to exceed softlimit so exceeding will
1206 * be always printed 1260 * be always printed
1207 */ 1261 */
1208 return NO_QUOTA; 1262 return -EDQUOT;
1209 } 1263 }
1210 1264
1211 return QUOTA_OK; 1265 return 0;
1212} 1266}
1213 1267
1214static int info_idq_free(struct dquot *dquot, qsize_t inodes) 1268static int info_idq_free(struct dquot *dquot, qsize_t inodes)
@@ -1242,25 +1296,32 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space)
1242 return QUOTA_NL_BHARDBELOW; 1296 return QUOTA_NL_BHARDBELOW;
1243 return QUOTA_NL_NOWARN; 1297 return QUOTA_NL_NOWARN;
1244} 1298}
1299
1245/* 1300/*
1246 * Initialize quota pointers in inode 1301 * Initialize quota pointers in inode
1247 * We do things in a bit complicated way but by that we avoid calling 1302 *
1248 * dqget() and thus filesystem callbacks under dqptr_sem. 1303 * We do things in a bit complicated way but by that we avoid calling
1304 * dqget() and thus filesystem callbacks under dqptr_sem.
1305 *
1306 * It is better to call this function outside of any transaction as it
1307 * might need a lot of space in journal for dquot structure allocation.
1249 */ 1308 */
1250int dquot_initialize(struct inode *inode, int type) 1309static void __dquot_initialize(struct inode *inode, int type)
1251{ 1310{
1252 unsigned int id = 0; 1311 unsigned int id = 0;
1253 int cnt, ret = 0; 1312 int cnt;
1254 struct dquot *got[MAXQUOTAS] = { NULL, NULL }; 1313 struct dquot *got[MAXQUOTAS];
1255 struct super_block *sb = inode->i_sb; 1314 struct super_block *sb = inode->i_sb;
1315 qsize_t rsv;
1256 1316
1257 /* First test before acquiring mutex - solves deadlocks when we 1317 /* First test before acquiring mutex - solves deadlocks when we
1258 * re-enter the quota code and are already holding the mutex */ 1318 * re-enter the quota code and are already holding the mutex */
1259 if (IS_NOQUOTA(inode)) 1319 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
1260 return 0; 1320 return;
1261 1321
1262 /* First get references to structures we might need. */ 1322 /* First get references to structures we might need. */
1263 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1323 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1324 got[cnt] = NULL;
1264 if (type != -1 && cnt != type) 1325 if (type != -1 && cnt != type)
1265 continue; 1326 continue;
1266 switch (cnt) { 1327 switch (cnt) {
@@ -1275,7 +1336,6 @@ int dquot_initialize(struct inode *inode, int type)
1275 } 1336 }
1276 1337
1277 down_write(&sb_dqopt(sb)->dqptr_sem); 1338 down_write(&sb_dqopt(sb)->dqptr_sem);
1278 /* Having dqptr_sem we know NOQUOTA flags can't be altered... */
1279 if (IS_NOQUOTA(inode)) 1339 if (IS_NOQUOTA(inode))
1280 goto out_err; 1340 goto out_err;
1281 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1341 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1287,20 +1347,31 @@ int dquot_initialize(struct inode *inode, int type)
1287 if (!inode->i_dquot[cnt]) { 1347 if (!inode->i_dquot[cnt]) {
1288 inode->i_dquot[cnt] = got[cnt]; 1348 inode->i_dquot[cnt] = got[cnt];
1289 got[cnt] = NULL; 1349 got[cnt] = NULL;
1350 /*
1351 * Make quota reservation system happy if someone
1352 * did a write before quota was turned on
1353 */
1354 rsv = inode_get_rsv_space(inode);
1355 if (unlikely(rsv))
1356 dquot_resv_space(inode->i_dquot[cnt], rsv);
1290 } 1357 }
1291 } 1358 }
1292out_err: 1359out_err:
1293 up_write(&sb_dqopt(sb)->dqptr_sem); 1360 up_write(&sb_dqopt(sb)->dqptr_sem);
1294 /* Drop unused references */ 1361 /* Drop unused references */
1295 dqput_all(got); 1362 dqput_all(got);
1296 return ret; 1363}
1364
1365void dquot_initialize(struct inode *inode)
1366{
1367 __dquot_initialize(inode, -1);
1297} 1368}
1298EXPORT_SYMBOL(dquot_initialize); 1369EXPORT_SYMBOL(dquot_initialize);
1299 1370
1300/* 1371/*
1301 * Release all quotas referenced by inode 1372 * Release all quotas referenced by inode
1302 */ 1373 */
1303int dquot_drop(struct inode *inode) 1374static void __dquot_drop(struct inode *inode)
1304{ 1375{
1305 int cnt; 1376 int cnt;
1306 struct dquot *put[MAXQUOTAS]; 1377 struct dquot *put[MAXQUOTAS];
@@ -1312,32 +1383,31 @@ int dquot_drop(struct inode *inode)
1312 } 1383 }
1313 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1384 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1314 dqput_all(put); 1385 dqput_all(put);
1315 return 0;
1316} 1386}
1317EXPORT_SYMBOL(dquot_drop);
1318 1387
1319/* Wrapper to remove references to quota structures from inode */ 1388void dquot_drop(struct inode *inode)
1320void vfs_dq_drop(struct inode *inode) 1389{
1321{ 1390 int cnt;
1322 /* Here we can get arbitrary inode from clear_inode() so we have 1391
1323 * to be careful. OTOH we don't need locking as quota operations 1392 if (IS_NOQUOTA(inode))
1324 * are allowed to change only at mount time */ 1393 return;
1325 if (!IS_NOQUOTA(inode) && inode->i_sb && inode->i_sb->dq_op 1394
1326 && inode->i_sb->dq_op->drop) { 1395 /*
1327 int cnt; 1396 * Test before calling to rule out calls from proc and such
1328 /* Test before calling to rule out calls from proc and such 1397 * where we are not allowed to block. Note that this is
1329 * where we are not allowed to block. Note that this is 1398 * actually reliable test even without the lock - the caller
1330 * actually reliable test even without the lock - the caller 1399 * must assure that nobody can come after the DQUOT_DROP and
1331 * must assure that nobody can come after the DQUOT_DROP and 1400 * add quota pointers back anyway.
1332 * add quota pointers back anyway */ 1401 */
1333 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1402 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1334 if (inode->i_dquot[cnt]) 1403 if (inode->i_dquot[cnt])
1335 break; 1404 break;
1336 if (cnt < MAXQUOTAS) 1405 }
1337 inode->i_sb->dq_op->drop(inode); 1406
1338 } 1407 if (cnt < MAXQUOTAS)
1339} 1408 __dquot_drop(inode);
1340EXPORT_SYMBOL(vfs_dq_drop); 1409}
1410EXPORT_SYMBOL(dquot_drop);
1341 1411
1342/* 1412/*
1343 * inode_reserved_space is managed internally by quota, and protected by 1413 * inode_reserved_space is managed internally by quota, and protected by
@@ -1351,32 +1421,37 @@ static qsize_t *inode_reserved_space(struct inode * inode)
1351 return inode->i_sb->dq_op->get_reserved_space(inode); 1421 return inode->i_sb->dq_op->get_reserved_space(inode);
1352} 1422}
1353 1423
1354static void inode_add_rsv_space(struct inode *inode, qsize_t number) 1424void inode_add_rsv_space(struct inode *inode, qsize_t number)
1355{ 1425{
1356 spin_lock(&inode->i_lock); 1426 spin_lock(&inode->i_lock);
1357 *inode_reserved_space(inode) += number; 1427 *inode_reserved_space(inode) += number;
1358 spin_unlock(&inode->i_lock); 1428 spin_unlock(&inode->i_lock);
1359} 1429}
1430EXPORT_SYMBOL(inode_add_rsv_space);
1360 1431
1361 1432void inode_claim_rsv_space(struct inode *inode, qsize_t number)
1362static void inode_claim_rsv_space(struct inode *inode, qsize_t number)
1363{ 1433{
1364 spin_lock(&inode->i_lock); 1434 spin_lock(&inode->i_lock);
1365 *inode_reserved_space(inode) -= number; 1435 *inode_reserved_space(inode) -= number;
1366 __inode_add_bytes(inode, number); 1436 __inode_add_bytes(inode, number);
1367 spin_unlock(&inode->i_lock); 1437 spin_unlock(&inode->i_lock);
1368} 1438}
1439EXPORT_SYMBOL(inode_claim_rsv_space);
1369 1440
1370static void inode_sub_rsv_space(struct inode *inode, qsize_t number) 1441void inode_sub_rsv_space(struct inode *inode, qsize_t number)
1371{ 1442{
1372 spin_lock(&inode->i_lock); 1443 spin_lock(&inode->i_lock);
1373 *inode_reserved_space(inode) -= number; 1444 *inode_reserved_space(inode) -= number;
1374 spin_unlock(&inode->i_lock); 1445 spin_unlock(&inode->i_lock);
1375} 1446}
1447EXPORT_SYMBOL(inode_sub_rsv_space);
1376 1448
1377static qsize_t inode_get_rsv_space(struct inode *inode) 1449static qsize_t inode_get_rsv_space(struct inode *inode)
1378{ 1450{
1379 qsize_t ret; 1451 qsize_t ret;
1452
1453 if (!inode->i_sb->dq_op->get_reserved_space)
1454 return 0;
1380 spin_lock(&inode->i_lock); 1455 spin_lock(&inode->i_lock);
1381 ret = *inode_reserved_space(inode); 1456 ret = *inode_reserved_space(inode);
1382 spin_unlock(&inode->i_lock); 1457 spin_unlock(&inode->i_lock);
@@ -1401,38 +1476,34 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
1401} 1476}
1402 1477
1403/* 1478/*
1404 * Following four functions update i_blocks+i_bytes fields and 1479 * This functions updates i_blocks+i_bytes fields and quota information
1405 * quota information (together with appropriate checks) 1480 * (together with appropriate checks).
1406 * NOTE: We absolutely rely on the fact that caller dirties 1481 *
1407 * the inode (usually macros in quotaops.h care about this) and 1482 * NOTE: We absolutely rely on the fact that caller dirties the inode
1408 * holds a handle for the current transaction so that dquot write and 1483 * (usually helpers in quotaops.h care about this) and holds a handle for
1409 * inode write go into the same transaction. 1484 * the current transaction so that dquot write and inode write go into the
1485 * same transaction.
1410 */ 1486 */
1411 1487
1412/* 1488/*
1413 * This operation can block, but only after everything is updated 1489 * This operation can block, but only after everything is updated
1414 */ 1490 */
1415int __dquot_alloc_space(struct inode *inode, qsize_t number, 1491int __dquot_alloc_space(struct inode *inode, qsize_t number,
1416 int warn, int reserve) 1492 int warn, int reserve)
1417{ 1493{
1418 int cnt, ret = QUOTA_OK; 1494 int cnt, ret = 0;
1419 char warntype[MAXQUOTAS]; 1495 char warntype[MAXQUOTAS];
1420 1496
1421 /* 1497 /*
1422 * First test before acquiring mutex - solves deadlocks when we 1498 * First test before acquiring mutex - solves deadlocks when we
1423 * re-enter the quota code and are already holding the mutex 1499 * re-enter the quota code and are already holding the mutex
1424 */ 1500 */
1425 if (IS_NOQUOTA(inode)) { 1501 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
1426 inode_incr_space(inode, number, reserve); 1502 inode_incr_space(inode, number, reserve);
1427 goto out; 1503 goto out;
1428 } 1504 }
1429 1505
1430 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1506 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1431 if (IS_NOQUOTA(inode)) {
1432 inode_incr_space(inode, number, reserve);
1433 goto out_unlock;
1434 }
1435
1436 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1507 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1437 warntype[cnt] = QUOTA_NL_NOWARN; 1508 warntype[cnt] = QUOTA_NL_NOWARN;
1438 1509
@@ -1440,9 +1511,9 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
1440 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1511 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1441 if (!inode->i_dquot[cnt]) 1512 if (!inode->i_dquot[cnt])
1442 continue; 1513 continue;
1443 if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt) 1514 ret = check_bdq(inode->i_dquot[cnt], number, !warn,
1444 == NO_QUOTA) { 1515 warntype+cnt);
1445 ret = NO_QUOTA; 1516 if (ret) {
1446 spin_unlock(&dq_data_lock); 1517 spin_unlock(&dq_data_lock);
1447 goto out_flush_warn; 1518 goto out_flush_warn;
1448 } 1519 }
@@ -1463,61 +1534,45 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
1463 mark_all_dquot_dirty(inode->i_dquot); 1534 mark_all_dquot_dirty(inode->i_dquot);
1464out_flush_warn: 1535out_flush_warn:
1465 flush_warnings(inode->i_dquot, warntype); 1536 flush_warnings(inode->i_dquot, warntype);
1466out_unlock:
1467 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1537 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1468out: 1538out:
1469 return ret; 1539 return ret;
1470} 1540}
1471 1541EXPORT_SYMBOL(__dquot_alloc_space);
1472int dquot_alloc_space(struct inode *inode, qsize_t number, int warn)
1473{
1474 return __dquot_alloc_space(inode, number, warn, 0);
1475}
1476EXPORT_SYMBOL(dquot_alloc_space);
1477
1478int dquot_reserve_space(struct inode *inode, qsize_t number, int warn)
1479{
1480 return __dquot_alloc_space(inode, number, warn, 1);
1481}
1482EXPORT_SYMBOL(dquot_reserve_space);
1483 1542
1484/* 1543/*
1485 * This operation can block, but only after everything is updated 1544 * This operation can block, but only after everything is updated
1486 */ 1545 */
1487int dquot_alloc_inode(const struct inode *inode, qsize_t number) 1546int dquot_alloc_inode(const struct inode *inode)
1488{ 1547{
1489 int cnt, ret = NO_QUOTA; 1548 int cnt, ret = 0;
1490 char warntype[MAXQUOTAS]; 1549 char warntype[MAXQUOTAS];
1491 1550
1492 /* First test before acquiring mutex - solves deadlocks when we 1551 /* First test before acquiring mutex - solves deadlocks when we
1493 * re-enter the quota code and are already holding the mutex */ 1552 * re-enter the quota code and are already holding the mutex */
1494 if (IS_NOQUOTA(inode)) 1553 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
1495 return QUOTA_OK; 1554 return 0;
1496 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1555 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1497 warntype[cnt] = QUOTA_NL_NOWARN; 1556 warntype[cnt] = QUOTA_NL_NOWARN;
1498 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1557 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1499 if (IS_NOQUOTA(inode)) {
1500 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1501 return QUOTA_OK;
1502 }
1503 spin_lock(&dq_data_lock); 1558 spin_lock(&dq_data_lock);
1504 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1559 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1505 if (!inode->i_dquot[cnt]) 1560 if (!inode->i_dquot[cnt])
1506 continue; 1561 continue;
1507 if (check_idq(inode->i_dquot[cnt], number, warntype+cnt) 1562 ret = check_idq(inode->i_dquot[cnt], 1, warntype + cnt);
1508 == NO_QUOTA) 1563 if (ret)
1509 goto warn_put_all; 1564 goto warn_put_all;
1510 } 1565 }
1511 1566
1512 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1567 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1513 if (!inode->i_dquot[cnt]) 1568 if (!inode->i_dquot[cnt])
1514 continue; 1569 continue;
1515 dquot_incr_inodes(inode->i_dquot[cnt], number); 1570 dquot_incr_inodes(inode->i_dquot[cnt], 1);
1516 } 1571 }
1517 ret = QUOTA_OK; 1572
1518warn_put_all: 1573warn_put_all:
1519 spin_unlock(&dq_data_lock); 1574 spin_unlock(&dq_data_lock);
1520 if (ret == QUOTA_OK) 1575 if (ret == 0)
1521 mark_all_dquot_dirty(inode->i_dquot); 1576 mark_all_dquot_dirty(inode->i_dquot);
1522 flush_warnings(inode->i_dquot, warntype); 1577 flush_warnings(inode->i_dquot, warntype);
1523 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1578 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
@@ -1525,23 +1580,19 @@ warn_put_all:
1525} 1580}
1526EXPORT_SYMBOL(dquot_alloc_inode); 1581EXPORT_SYMBOL(dquot_alloc_inode);
1527 1582
1528int dquot_claim_space(struct inode *inode, qsize_t number) 1583/*
1584 * Convert in-memory reserved quotas to real consumed quotas
1585 */
1586int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
1529{ 1587{
1530 int cnt; 1588 int cnt;
1531 int ret = QUOTA_OK;
1532 1589
1533 if (IS_NOQUOTA(inode)) { 1590 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
1534 inode_claim_rsv_space(inode, number); 1591 inode_claim_rsv_space(inode, number);
1535 goto out; 1592 return 0;
1536 } 1593 }
1537 1594
1538 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1595 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1539 if (IS_NOQUOTA(inode)) {
1540 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1541 inode_claim_rsv_space(inode, number);
1542 goto out;
1543 }
1544
1545 spin_lock(&dq_data_lock); 1596 spin_lock(&dq_data_lock);
1546 /* Claim reserved quotas to allocated quotas */ 1597 /* Claim reserved quotas to allocated quotas */
1547 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1598 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1554,33 +1605,26 @@ int dquot_claim_space(struct inode *inode, qsize_t number)
1554 spin_unlock(&dq_data_lock); 1605 spin_unlock(&dq_data_lock);
1555 mark_all_dquot_dirty(inode->i_dquot); 1606 mark_all_dquot_dirty(inode->i_dquot);
1556 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1607 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1557out: 1608 return 0;
1558 return ret;
1559} 1609}
1560EXPORT_SYMBOL(dquot_claim_space); 1610EXPORT_SYMBOL(dquot_claim_space_nodirty);
1561 1611
1562/* 1612/*
1563 * This operation can block, but only after everything is updated 1613 * This operation can block, but only after everything is updated
1564 */ 1614 */
1565int __dquot_free_space(struct inode *inode, qsize_t number, int reserve) 1615void __dquot_free_space(struct inode *inode, qsize_t number, int reserve)
1566{ 1616{
1567 unsigned int cnt; 1617 unsigned int cnt;
1568 char warntype[MAXQUOTAS]; 1618 char warntype[MAXQUOTAS];
1569 1619
1570 /* First test before acquiring mutex - solves deadlocks when we 1620 /* First test before acquiring mutex - solves deadlocks when we
1571 * re-enter the quota code and are already holding the mutex */ 1621 * re-enter the quota code and are already holding the mutex */
1572 if (IS_NOQUOTA(inode)) { 1622 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
1573out_sub:
1574 inode_decr_space(inode, number, reserve); 1623 inode_decr_space(inode, number, reserve);
1575 return QUOTA_OK; 1624 return;
1576 } 1625 }
1577 1626
1578 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1627 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1579 /* Now recheck reliably when holding dqptr_sem */
1580 if (IS_NOQUOTA(inode)) {
1581 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1582 goto out_sub;
1583 }
1584 spin_lock(&dq_data_lock); 1628 spin_lock(&dq_data_lock);
1585 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1629 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1586 if (!inode->i_dquot[cnt]) 1630 if (!inode->i_dquot[cnt])
@@ -1600,56 +1644,34 @@ out_sub:
1600out_unlock: 1644out_unlock:
1601 flush_warnings(inode->i_dquot, warntype); 1645 flush_warnings(inode->i_dquot, warntype);
1602 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1646 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1603 return QUOTA_OK;
1604}
1605
1606int dquot_free_space(struct inode *inode, qsize_t number)
1607{
1608 return __dquot_free_space(inode, number, 0);
1609}
1610EXPORT_SYMBOL(dquot_free_space);
1611
1612/*
1613 * Release reserved quota space
1614 */
1615void dquot_release_reserved_space(struct inode *inode, qsize_t number)
1616{
1617 __dquot_free_space(inode, number, 1);
1618
1619} 1647}
1620EXPORT_SYMBOL(dquot_release_reserved_space); 1648EXPORT_SYMBOL(__dquot_free_space);
1621 1649
1622/* 1650/*
1623 * This operation can block, but only after everything is updated 1651 * This operation can block, but only after everything is updated
1624 */ 1652 */
1625int dquot_free_inode(const struct inode *inode, qsize_t number) 1653void dquot_free_inode(const struct inode *inode)
1626{ 1654{
1627 unsigned int cnt; 1655 unsigned int cnt;
1628 char warntype[MAXQUOTAS]; 1656 char warntype[MAXQUOTAS];
1629 1657
1630 /* First test before acquiring mutex - solves deadlocks when we 1658 /* First test before acquiring mutex - solves deadlocks when we
1631 * re-enter the quota code and are already holding the mutex */ 1659 * re-enter the quota code and are already holding the mutex */
1632 if (IS_NOQUOTA(inode)) 1660 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
1633 return QUOTA_OK; 1661 return;
1634 1662
1635 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1663 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1636 /* Now recheck reliably when holding dqptr_sem */
1637 if (IS_NOQUOTA(inode)) {
1638 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1639 return QUOTA_OK;
1640 }
1641 spin_lock(&dq_data_lock); 1664 spin_lock(&dq_data_lock);
1642 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1665 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1643 if (!inode->i_dquot[cnt]) 1666 if (!inode->i_dquot[cnt])
1644 continue; 1667 continue;
1645 warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number); 1668 warntype[cnt] = info_idq_free(inode->i_dquot[cnt], 1);
1646 dquot_decr_inodes(inode->i_dquot[cnt], number); 1669 dquot_decr_inodes(inode->i_dquot[cnt], 1);
1647 } 1670 }
1648 spin_unlock(&dq_data_lock); 1671 spin_unlock(&dq_data_lock);
1649 mark_all_dquot_dirty(inode->i_dquot); 1672 mark_all_dquot_dirty(inode->i_dquot);
1650 flush_warnings(inode->i_dquot, warntype); 1673 flush_warnings(inode->i_dquot, warntype);
1651 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1674 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
1652 return QUOTA_OK;
1653} 1675}
1654EXPORT_SYMBOL(dquot_free_inode); 1676EXPORT_SYMBOL(dquot_free_inode);
1655 1677
@@ -1659,37 +1681,31 @@ EXPORT_SYMBOL(dquot_free_inode);
1659 * This operation can block, but only after everything is updated 1681 * This operation can block, but only after everything is updated
1660 * A transaction must be started when entering this function. 1682 * A transaction must be started when entering this function.
1661 */ 1683 */
1662int dquot_transfer(struct inode *inode, struct iattr *iattr) 1684static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask)
1663{ 1685{
1664 qsize_t space, cur_space; 1686 qsize_t space, cur_space;
1665 qsize_t rsv_space = 0; 1687 qsize_t rsv_space = 0;
1666 struct dquot *transfer_from[MAXQUOTAS]; 1688 struct dquot *transfer_from[MAXQUOTAS];
1667 struct dquot *transfer_to[MAXQUOTAS]; 1689 struct dquot *transfer_to[MAXQUOTAS];
1668 int cnt, ret = QUOTA_OK; 1690 int cnt, ret = 0;
1669 int chuid = iattr->ia_valid & ATTR_UID && inode->i_uid != iattr->ia_uid,
1670 chgid = iattr->ia_valid & ATTR_GID && inode->i_gid != iattr->ia_gid;
1671 char warntype_to[MAXQUOTAS]; 1691 char warntype_to[MAXQUOTAS];
1672 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; 1692 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
1673 1693
1674 /* First test before acquiring mutex - solves deadlocks when we 1694 /* First test before acquiring mutex - solves deadlocks when we
1675 * re-enter the quota code and are already holding the mutex */ 1695 * re-enter the quota code and are already holding the mutex */
1676 if (IS_NOQUOTA(inode)) 1696 if (IS_NOQUOTA(inode))
1677 return QUOTA_OK; 1697 return 0;
1678 /* Initialize the arrays */ 1698 /* Initialize the arrays */
1679 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1699 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1680 transfer_from[cnt] = NULL; 1700 transfer_from[cnt] = NULL;
1681 transfer_to[cnt] = NULL; 1701 transfer_to[cnt] = NULL;
1682 warntype_to[cnt] = QUOTA_NL_NOWARN; 1702 warntype_to[cnt] = QUOTA_NL_NOWARN;
1683 } 1703 }
1684 if (chuid) 1704 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1685 transfer_to[USRQUOTA] = dqget(inode->i_sb, iattr->ia_uid, 1705 if (mask & (1 << cnt))
1686 USRQUOTA); 1706 transfer_to[cnt] = dqget(inode->i_sb, chid[cnt], cnt);
1687 if (chgid) 1707 }
1688 transfer_to[GRPQUOTA] = dqget(inode->i_sb, iattr->ia_gid,
1689 GRPQUOTA);
1690
1691 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1708 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1692 /* Now recheck reliably when holding dqptr_sem */
1693 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ 1709 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
1694 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1710 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1695 goto put_all; 1711 goto put_all;
@@ -1703,9 +1719,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1703 if (!transfer_to[cnt]) 1719 if (!transfer_to[cnt])
1704 continue; 1720 continue;
1705 transfer_from[cnt] = inode->i_dquot[cnt]; 1721 transfer_from[cnt] = inode->i_dquot[cnt];
1706 if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) == 1722 ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt);
1707 NO_QUOTA || check_bdq(transfer_to[cnt], space, 0, 1723 if (ret)
1708 warntype_to + cnt) == NO_QUOTA) 1724 goto over_quota;
1725 ret = check_bdq(transfer_to[cnt], space, 0, warntype_to + cnt);
1726 if (ret)
1709 goto over_quota; 1727 goto over_quota;
1710 } 1728 }
1711 1729
@@ -1759,22 +1777,32 @@ over_quota:
1759 /* Clear dquot pointers we don't want to dqput() */ 1777 /* Clear dquot pointers we don't want to dqput() */
1760 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1778 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1761 transfer_from[cnt] = NULL; 1779 transfer_from[cnt] = NULL;
1762 ret = NO_QUOTA;
1763 goto warn_put_all; 1780 goto warn_put_all;
1764} 1781}
1765EXPORT_SYMBOL(dquot_transfer);
1766 1782
1767/* Wrapper for transferring ownership of an inode */ 1783/* Wrapper for transferring ownership of an inode for uid/gid only
1768int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) 1784 * Called from FSXXX_setattr()
1785 */
1786int dquot_transfer(struct inode *inode, struct iattr *iattr)
1769{ 1787{
1788 qid_t chid[MAXQUOTAS];
1789 unsigned long mask = 0;
1790
1791 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) {
1792 mask |= 1 << USRQUOTA;
1793 chid[USRQUOTA] = iattr->ia_uid;
1794 }
1795 if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) {
1796 mask |= 1 << GRPQUOTA;
1797 chid[GRPQUOTA] = iattr->ia_gid;
1798 }
1770 if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { 1799 if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) {
1771 vfs_dq_init(inode); 1800 dquot_initialize(inode);
1772 if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA) 1801 return __dquot_transfer(inode, chid, mask);
1773 return 1;
1774 } 1802 }
1775 return 0; 1803 return 0;
1776} 1804}
1777EXPORT_SYMBOL(vfs_dq_transfer); 1805EXPORT_SYMBOL(dquot_transfer);
1778 1806
1779/* 1807/*
1780 * Write info of quota file to disk 1808 * Write info of quota file to disk
@@ -1795,13 +1823,6 @@ EXPORT_SYMBOL(dquot_commit_info);
1795 * Definitions of diskquota operations. 1823 * Definitions of diskquota operations.
1796 */ 1824 */
1797const struct dquot_operations dquot_operations = { 1825const struct dquot_operations dquot_operations = {
1798 .initialize = dquot_initialize,
1799 .drop = dquot_drop,
1800 .alloc_space = dquot_alloc_space,
1801 .alloc_inode = dquot_alloc_inode,
1802 .free_space = dquot_free_space,
1803 .free_inode = dquot_free_inode,
1804 .transfer = dquot_transfer,
1805 .write_dquot = dquot_commit, 1826 .write_dquot = dquot_commit,
1806 .acquire_dquot = dquot_acquire, 1827 .acquire_dquot = dquot_acquire,
1807 .release_dquot = dquot_release, 1828 .release_dquot = dquot_release,
@@ -1812,6 +1833,20 @@ const struct dquot_operations dquot_operations = {
1812}; 1833};
1813 1834
1814/* 1835/*
1836 * Generic helper for ->open on filesystems supporting disk quotas.
1837 */
1838int dquot_file_open(struct inode *inode, struct file *file)
1839{
1840 int error;
1841
1842 error = generic_file_open(inode, file);
1843 if (!error && (file->f_mode & FMODE_WRITE))
1844 dquot_initialize(inode);
1845 return error;
1846}
1847EXPORT_SYMBOL(dquot_file_open);
1848
1849/*
1815 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) 1850 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
1816 */ 1851 */
1817int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags) 1852int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
@@ -1990,11 +2025,13 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
1990 } 2025 }
1991 2026
1992 if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { 2027 if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
1993 /* As we bypass the pagecache we must now flush the inode so 2028 /* As we bypass the pagecache we must now flush all the
1994 * that we see all the changes from userspace... */ 2029 * dirty data and invalidate caches so that kernel sees
1995 write_inode_now(inode, 1); 2030 * changes from userspace. It is not enough to just flush
1996 /* And now flush the block cache so that kernel sees the 2031 * the quota file since if blocksize < pagesize, invalidation
1997 * changes */ 2032 * of the cache could fail because of other unrelated dirty
2033 * data */
2034 sync_filesystem(sb);
1998 invalidate_bdev(sb->s_bdev); 2035 invalidate_bdev(sb->s_bdev);
1999 } 2036 }
2000 mutex_lock(&dqopt->dqonoff_mutex); 2037 mutex_lock(&dqopt->dqonoff_mutex);
@@ -2007,14 +2044,16 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
2007 /* We don't want quota and atime on quota files (deadlocks 2044 /* We don't want quota and atime on quota files (deadlocks
2008 * possible) Also nobody should write to the file - we use 2045 * possible) Also nobody should write to the file - we use
2009 * special IO operations which ignore the immutable bit. */ 2046 * special IO operations which ignore the immutable bit. */
2010 down_write(&dqopt->dqptr_sem);
2011 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); 2047 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
2012 oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | 2048 oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
2013 S_NOQUOTA); 2049 S_NOQUOTA);
2014 inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; 2050 inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
2015 mutex_unlock(&inode->i_mutex); 2051 mutex_unlock(&inode->i_mutex);
2016 up_write(&dqopt->dqptr_sem); 2052 /*
2017 sb->dq_op->drop(inode); 2053 * When S_NOQUOTA is set, remove dquot references as no more
2054 * references can be added
2055 */
2056 __dquot_drop(inode);
2018 } 2057 }
2019 2058
2020 error = -EIO; 2059 error = -EIO;
@@ -2050,14 +2089,12 @@ out_file_init:
2050 iput(inode); 2089 iput(inode);
2051out_lock: 2090out_lock:
2052 if (oldflags != -1) { 2091 if (oldflags != -1) {
2053 down_write(&dqopt->dqptr_sem);
2054 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); 2092 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
2055 /* Set the flags back (in the case of accidental quotaon() 2093 /* Set the flags back (in the case of accidental quotaon()
2056 * on a wrong file we don't want to mess up the flags) */ 2094 * on a wrong file we don't want to mess up the flags) */
2057 inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); 2095 inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE);
2058 inode->i_flags |= oldflags; 2096 inode->i_flags |= oldflags;
2059 mutex_unlock(&inode->i_mutex); 2097 mutex_unlock(&inode->i_mutex);
2060 up_write(&dqopt->dqptr_sem);
2061 } 2098 }
2062 mutex_unlock(&dqopt->dqonoff_mutex); 2099 mutex_unlock(&dqopt->dqonoff_mutex);
2063out_fmt: 2100out_fmt:
@@ -2289,34 +2326,34 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2289 if (di->dqb_valid & QIF_SPACE) { 2326 if (di->dqb_valid & QIF_SPACE) {
2290 dm->dqb_curspace = di->dqb_curspace - dm->dqb_rsvspace; 2327 dm->dqb_curspace = di->dqb_curspace - dm->dqb_rsvspace;
2291 check_blim = 1; 2328 check_blim = 1;
2292 __set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); 2329 set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
2293 } 2330 }
2294 if (di->dqb_valid & QIF_BLIMITS) { 2331 if (di->dqb_valid & QIF_BLIMITS) {
2295 dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit); 2332 dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit);
2296 dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit); 2333 dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit);
2297 check_blim = 1; 2334 check_blim = 1;
2298 __set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); 2335 set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
2299 } 2336 }
2300 if (di->dqb_valid & QIF_INODES) { 2337 if (di->dqb_valid & QIF_INODES) {
2301 dm->dqb_curinodes = di->dqb_curinodes; 2338 dm->dqb_curinodes = di->dqb_curinodes;
2302 check_ilim = 1; 2339 check_ilim = 1;
2303 __set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); 2340 set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
2304 } 2341 }
2305 if (di->dqb_valid & QIF_ILIMITS) { 2342 if (di->dqb_valid & QIF_ILIMITS) {
2306 dm->dqb_isoftlimit = di->dqb_isoftlimit; 2343 dm->dqb_isoftlimit = di->dqb_isoftlimit;
2307 dm->dqb_ihardlimit = di->dqb_ihardlimit; 2344 dm->dqb_ihardlimit = di->dqb_ihardlimit;
2308 check_ilim = 1; 2345 check_ilim = 1;
2309 __set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); 2346 set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
2310 } 2347 }
2311 if (di->dqb_valid & QIF_BTIME) { 2348 if (di->dqb_valid & QIF_BTIME) {
2312 dm->dqb_btime = di->dqb_btime; 2349 dm->dqb_btime = di->dqb_btime;
2313 check_blim = 1; 2350 check_blim = 1;
2314 __set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); 2351 set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
2315 } 2352 }
2316 if (di->dqb_valid & QIF_ITIME) { 2353 if (di->dqb_valid & QIF_ITIME) {
2317 dm->dqb_itime = di->dqb_itime; 2354 dm->dqb_itime = di->dqb_itime;
2318 check_ilim = 1; 2355 check_ilim = 1;
2319 __set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); 2356 set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
2320 } 2357 }
2321 2358
2322 if (check_blim) { 2359 if (check_blim) {
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
new file mode 100644
index 000000000000..d67908b407d9
--- /dev/null
+++ b/fs/quota/netlink.c
@@ -0,0 +1,96 @@
1
2#include <linux/cred.h>
3#include <linux/init.h>
4#include <linux/module.h>
5#include <linux/kernel.h>
6#include <linux/quotaops.h>
7#include <linux/sched.h>
8#include <linux/slab.h>
9#include <net/netlink.h>
10#include <net/genetlink.h>
11
12/* Netlink family structure for quota */
13static struct genl_family quota_genl_family = {
14 .id = GENL_ID_GENERATE,
15 .hdrsize = 0,
16 .name = "VFS_DQUOT",
17 .version = 1,
18 .maxattr = QUOTA_NL_A_MAX,
19};
20
21/**
22 * quota_send_warning - Send warning to userspace about exceeded quota
23 * @type: The quota type: USRQUOTA, GRPQUOTA,...
24 * @id: The user or group id of the quota that was exceeded
25 * @dev: The device on which the fs is mounted (sb->s_dev)
26 * @warntype: The type of the warning: QUOTA_NL_...
27 *
28 * This can be used by filesystems (including those which don't use
29 * dquot) to send a message to userspace relating to quota limits.
30 *
31 */
32
33void quota_send_warning(short type, unsigned int id, dev_t dev,
34 const char warntype)
35{
36 static atomic_t seq;
37 struct sk_buff *skb;
38 void *msg_head;
39 int ret;
40 int msg_size = 4 * nla_total_size(sizeof(u32)) +
41 2 * nla_total_size(sizeof(u64));
42
43 /* We have to allocate using GFP_NOFS as we are called from a
44 * filesystem performing write and thus further recursion into
45 * the fs to free some data could cause deadlocks. */
46 skb = genlmsg_new(msg_size, GFP_NOFS);
47 if (!skb) {
48 printk(KERN_ERR
49 "VFS: Not enough memory to send quota warning.\n");
50 return;
51 }
52 msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
53 &quota_genl_family, 0, QUOTA_NL_C_WARNING);
54 if (!msg_head) {
55 printk(KERN_ERR
56 "VFS: Cannot store netlink header in quota warning.\n");
57 goto err_out;
58 }
59 ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type);
60 if (ret)
61 goto attr_err_out;
62 ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id);
63 if (ret)
64 goto attr_err_out;
65 ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
66 if (ret)
67 goto attr_err_out;
68 ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev));
69 if (ret)
70 goto attr_err_out;
71 ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev));
72 if (ret)
73 goto attr_err_out;
74 ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
75 if (ret)
76 goto attr_err_out;
77 genlmsg_end(skb, msg_head);
78
79 genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
80 return;
81attr_err_out:
82 printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
83err_out:
84 kfree_skb(skb);
85}
86EXPORT_SYMBOL(quota_send_warning);
87
88static int __init quota_init(void)
89{
90 if (genl_register_family(&quota_genl_family) != 0)
91 printk(KERN_ERR
92 "VFS: Failed to create quota netlink interface.\n");
93 return 0;
94};
95
96module_init(quota_init);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index ee91e2756950..95388f9b7356 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -10,7 +10,6 @@
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <asm/current.h> 11#include <asm/current.h>
12#include <asm/uaccess.h> 12#include <asm/uaccess.h>
13#include <linux/compat.h>
14#include <linux/kernel.h> 13#include <linux/kernel.h>
15#include <linux/security.h> 14#include <linux/security.h>
16#include <linux/syscalls.h> 15#include <linux/syscalls.h>
@@ -18,220 +17,205 @@
18#include <linux/capability.h> 17#include <linux/capability.h>
19#include <linux/quotaops.h> 18#include <linux/quotaops.h>
20#include <linux/types.h> 19#include <linux/types.h>
21#include <net/netlink.h> 20#include <linux/writeback.h>
22#include <net/genetlink.h>
23 21
24/* Check validity of generic quotactl commands */ 22static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
25static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, 23 qid_t id)
26 qid_t id)
27{ 24{
28 if (type >= MAXQUOTAS)
29 return -EINVAL;
30 if (!sb && cmd != Q_SYNC)
31 return -ENODEV;
32 /* Is operation supported? */
33 if (sb && !sb->s_qcop)
34 return -ENOSYS;
35
36 switch (cmd) { 25 switch (cmd) {
37 case Q_GETFMT: 26 /* these commands do not require any special privileges */
38 break; 27 case Q_GETFMT:
39 case Q_QUOTAON: 28 case Q_SYNC:
40 if (!sb->s_qcop->quota_on) 29 case Q_GETINFO:
41 return -ENOSYS; 30 case Q_XGETQSTAT:
42 break; 31 case Q_XQUOTASYNC:
43 case Q_QUOTAOFF: 32 break;
44 if (!sb->s_qcop->quota_off) 33 /* allow to query information for dquots we "own" */
45 return -ENOSYS; 34 case Q_GETQUOTA:
46 break; 35 case Q_XGETQUOTA:
47 case Q_SETINFO: 36 if ((type == USRQUOTA && current_euid() == id) ||
48 if (!sb->s_qcop->set_info) 37 (type == GRPQUOTA && in_egroup_p(id)))
49 return -ENOSYS;
50 break;
51 case Q_GETINFO:
52 if (!sb->s_qcop->get_info)
53 return -ENOSYS;
54 break;
55 case Q_SETQUOTA:
56 if (!sb->s_qcop->set_dqblk)
57 return -ENOSYS;
58 break;
59 case Q_GETQUOTA:
60 if (!sb->s_qcop->get_dqblk)
61 return -ENOSYS;
62 break;
63 case Q_SYNC:
64 if (sb && !sb->s_qcop->quota_sync)
65 return -ENOSYS;
66 break; 38 break;
67 default: 39 /*FALLTHROUGH*/
68 return -EINVAL; 40 default:
41 if (!capable(CAP_SYS_ADMIN))
42 return -EPERM;
69 } 43 }
70 44
71 /* Is quota turned on for commands which need it? */ 45 return security_quotactl(cmd, type, id, sb);
72 switch (cmd) { 46}
73 case Q_GETFMT:
74 case Q_GETINFO:
75 case Q_SETINFO:
76 case Q_SETQUOTA:
77 case Q_GETQUOTA:
78 /* This is just an informative test so we are satisfied
79 * without the lock */
80 if (!sb_has_quota_active(sb, type))
81 return -ESRCH;
82 }
83 47
84 /* Check privileges */ 48static int quota_sync_all(int type)
85 if (cmd == Q_GETQUOTA) { 49{
86 if (((type == USRQUOTA && current_euid() != id) || 50 struct super_block *sb;
87 (type == GRPQUOTA && !in_egroup_p(id))) && 51 int ret;
88 !capable(CAP_SYS_ADMIN)) 52
89 return -EPERM; 53 if (type >= MAXQUOTAS)
54 return -EINVAL;
55 ret = security_quotactl(Q_SYNC, type, 0, NULL);
56 if (ret)
57 return ret;
58
59 spin_lock(&sb_lock);
60restart:
61 list_for_each_entry(sb, &super_blocks, s_list) {
62 if (!sb->s_qcop || !sb->s_qcop->quota_sync)
63 continue;
64
65 sb->s_count++;
66 spin_unlock(&sb_lock);
67 down_read(&sb->s_umount);
68 if (sb->s_root)
69 sb->s_qcop->quota_sync(sb, type, 1);
70 up_read(&sb->s_umount);
71 spin_lock(&sb_lock);
72 if (__put_super_and_need_restart(sb))
73 goto restart;
90 } 74 }
91 else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO) 75 spin_unlock(&sb_lock);
92 if (!capable(CAP_SYS_ADMIN))
93 return -EPERM;
94 76
95 return 0; 77 return 0;
96} 78}
97 79
98/* Check validity of XFS Quota Manager commands */ 80static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
99static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd, 81 void __user *addr)
100 qid_t id)
101{ 82{
102 if (type >= XQM_MAXQUOTAS) 83 char *pathname;
103 return -EINVAL; 84 int ret = -ENOSYS;
104 if (!sb) 85
105 return -ENODEV; 86 pathname = getname(addr);
106 if (!sb->s_qcop) 87 if (IS_ERR(pathname))
107 return -ENOSYS; 88 return PTR_ERR(pathname);
89 if (sb->s_qcop->quota_on)
90 ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0);
91 putname(pathname);
92 return ret;
93}
108 94
109 switch (cmd) { 95static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
110 case Q_XQUOTAON: 96{
111 case Q_XQUOTAOFF: 97 __u32 fmt;
112 case Q_XQUOTARM:
113 if (!sb->s_qcop->set_xstate)
114 return -ENOSYS;
115 break;
116 case Q_XGETQSTAT:
117 if (!sb->s_qcop->get_xstate)
118 return -ENOSYS;
119 break;
120 case Q_XSETQLIM:
121 if (!sb->s_qcop->set_xquota)
122 return -ENOSYS;
123 break;
124 case Q_XGETQUOTA:
125 if (!sb->s_qcop->get_xquota)
126 return -ENOSYS;
127 break;
128 case Q_XQUOTASYNC:
129 if (!sb->s_qcop->quota_sync)
130 return -ENOSYS;
131 break;
132 default:
133 return -EINVAL;
134 }
135 98
136 /* Check privileges */ 99 down_read(&sb_dqopt(sb)->dqptr_sem);
137 if (cmd == Q_XGETQUOTA) { 100 if (!sb_has_quota_active(sb, type)) {
138 if (((type == XQM_USRQUOTA && current_euid() != id) || 101 up_read(&sb_dqopt(sb)->dqptr_sem);
139 (type == XQM_GRPQUOTA && !in_egroup_p(id))) && 102 return -ESRCH;
140 !capable(CAP_SYS_ADMIN))
141 return -EPERM;
142 } else if (cmd != Q_XGETQSTAT && cmd != Q_XQUOTASYNC) {
143 if (!capable(CAP_SYS_ADMIN))
144 return -EPERM;
145 } 103 }
104 fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id;
105 up_read(&sb_dqopt(sb)->dqptr_sem);
106 if (copy_to_user(addr, &fmt, sizeof(fmt)))
107 return -EFAULT;
108 return 0;
109}
146 110
111static int quota_getinfo(struct super_block *sb, int type, void __user *addr)
112{
113 struct if_dqinfo info;
114 int ret;
115
116 if (!sb_has_quota_active(sb, type))
117 return -ESRCH;
118 if (!sb->s_qcop->get_info)
119 return -ENOSYS;
120 ret = sb->s_qcop->get_info(sb, type, &info);
121 if (!ret && copy_to_user(addr, &info, sizeof(info)))
122 return -EFAULT;
123 return ret;
124}
125
126static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
127{
128 struct if_dqinfo info;
129
130 if (copy_from_user(&info, addr, sizeof(info)))
131 return -EFAULT;
132 if (!sb_has_quota_active(sb, type))
133 return -ESRCH;
134 if (!sb->s_qcop->set_info)
135 return -ENOSYS;
136 return sb->s_qcop->set_info(sb, type, &info);
137}
138
139static int quota_getquota(struct super_block *sb, int type, qid_t id,
140 void __user *addr)
141{
142 struct if_dqblk idq;
143 int ret;
144
145 if (!sb_has_quota_active(sb, type))
146 return -ESRCH;
147 if (!sb->s_qcop->get_dqblk)
148 return -ENOSYS;
149 ret = sb->s_qcop->get_dqblk(sb, type, id, &idq);
150 if (ret)
151 return ret;
152 if (copy_to_user(addr, &idq, sizeof(idq)))
153 return -EFAULT;
147 return 0; 154 return 0;
148} 155}
149 156
150static int check_quotactl_valid(struct super_block *sb, int type, int cmd, 157static int quota_setquota(struct super_block *sb, int type, qid_t id,
151 qid_t id) 158 void __user *addr)
152{ 159{
153 int error; 160 struct if_dqblk idq;
154 161
155 if (XQM_COMMAND(cmd)) 162 if (copy_from_user(&idq, addr, sizeof(idq)))
156 error = xqm_quotactl_valid(sb, type, cmd, id); 163 return -EFAULT;
157 else 164 if (!sb_has_quota_active(sb, type))
158 error = generic_quotactl_valid(sb, type, cmd, id); 165 return -ESRCH;
159 if (!error) 166 if (!sb->s_qcop->set_dqblk)
160 error = security_quotactl(cmd, type, id, sb); 167 return -ENOSYS;
161 return error; 168 return sb->s_qcop->set_dqblk(sb, type, id, &idq);
162} 169}
163 170
164#ifdef CONFIG_QUOTA 171static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr)
165void sync_quota_sb(struct super_block *sb, int type)
166{ 172{
167 int cnt; 173 __u32 flags;
168 174
169 if (!sb->s_qcop->quota_sync) 175 if (copy_from_user(&flags, addr, sizeof(flags)))
170 return; 176 return -EFAULT;
177 if (!sb->s_qcop->set_xstate)
178 return -ENOSYS;
179 return sb->s_qcop->set_xstate(sb, flags, cmd);
180}
171 181
172 sb->s_qcop->quota_sync(sb, type); 182static int quota_getxstate(struct super_block *sb, void __user *addr)
183{
184 struct fs_quota_stat fqs;
185 int ret;
173 186
174 if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE) 187 if (!sb->s_qcop->get_xstate)
175 return; 188 return -ENOSYS;
176 /* This is not very clever (and fast) but currently I don't know about 189 ret = sb->s_qcop->get_xstate(sb, &fqs);
177 * any other simple way of getting quota data to disk and we must get 190 if (!ret && copy_to_user(addr, &fqs, sizeof(fqs)))
178 * them there for userspace to be visible... */ 191 return -EFAULT;
179 if (sb->s_op->sync_fs) 192 return ret;
180 sb->s_op->sync_fs(sb, 1); 193}
181 sync_blockdev(sb->s_bdev);
182 194
183 /* 195static int quota_setxquota(struct super_block *sb, int type, qid_t id,
184 * Now when everything is written we can discard the pagecache so 196 void __user *addr)
185 * that userspace sees the changes. 197{
186 */ 198 struct fs_disk_quota fdq;
187 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 199
188 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 200 if (copy_from_user(&fdq, addr, sizeof(fdq)))
189 if (type != -1 && cnt != type) 201 return -EFAULT;
190 continue; 202 if (!sb->s_qcop->set_xquota)
191 if (!sb_has_quota_active(sb, cnt)) 203 return -ENOSYS;
192 continue; 204 return sb->s_qcop->set_xquota(sb, type, id, &fdq);
193 mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex,
194 I_MUTEX_QUOTA);
195 truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
196 mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex);
197 }
198 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
199} 205}
200#endif
201 206
202static void sync_dquots(int type) 207static int quota_getxquota(struct super_block *sb, int type, qid_t id,
208 void __user *addr)
203{ 209{
204 struct super_block *sb; 210 struct fs_disk_quota fdq;
205 int cnt; 211 int ret;
206 212
207 spin_lock(&sb_lock); 213 if (!sb->s_qcop->get_xquota)
208restart: 214 return -ENOSYS;
209 list_for_each_entry(sb, &super_blocks, s_list) { 215 ret = sb->s_qcop->get_xquota(sb, type, id, &fdq);
210 /* This test just improves performance so it needn't be 216 if (!ret && copy_to_user(addr, &fdq, sizeof(fdq)))
211 * reliable... */ 217 return -EFAULT;
212 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 218 return ret;
213 if (type != -1 && type != cnt)
214 continue;
215 if (!sb_has_quota_active(sb, cnt))
216 continue;
217 if (!info_dirty(&sb_dqopt(sb)->info[cnt]) &&
218 list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
219 continue;
220 break;
221 }
222 if (cnt == MAXQUOTAS)
223 continue;
224 sb->s_count++;
225 spin_unlock(&sb_lock);
226 down_read(&sb->s_umount);
227 if (sb->s_root)
228 sync_quota_sb(sb, type);
229 up_read(&sb->s_umount);
230 spin_lock(&sb_lock);
231 if (__put_super_and_need_restart(sb))
232 goto restart;
233 }
234 spin_unlock(&sb_lock);
235} 219}
236 220
237/* Copy parameters and call proper function */ 221/* Copy parameters and call proper function */
@@ -240,117 +224,55 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
240{ 224{
241 int ret; 225 int ret;
242 226
227 if (type >= (XQM_COMMAND(cmd) ? XQM_MAXQUOTAS : MAXQUOTAS))
228 return -EINVAL;
229 if (!sb->s_qcop)
230 return -ENOSYS;
231
232 ret = check_quotactl_permission(sb, type, cmd, id);
233 if (ret < 0)
234 return ret;
235
243 switch (cmd) { 236 switch (cmd) {
244 case Q_QUOTAON: { 237 case Q_QUOTAON:
245 char *pathname; 238 return quota_quotaon(sb, type, cmd, id, addr);
246 239 case Q_QUOTAOFF:
247 pathname = getname(addr); 240 if (!sb->s_qcop->quota_off)
248 if (IS_ERR(pathname)) 241 return -ENOSYS;
249 return PTR_ERR(pathname); 242 return sb->s_qcop->quota_off(sb, type, 0);
250 ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); 243 case Q_GETFMT:
251 putname(pathname); 244 return quota_getfmt(sb, type, addr);
252 return ret; 245 case Q_GETINFO:
253 } 246 return quota_getinfo(sb, type, addr);
254 case Q_QUOTAOFF: 247 case Q_SETINFO:
255 return sb->s_qcop->quota_off(sb, type, 0); 248 return quota_setinfo(sb, type, addr);
256 249 case Q_GETQUOTA:
257 case Q_GETFMT: { 250 return quota_getquota(sb, type, id, addr);
258 __u32 fmt; 251 case Q_SETQUOTA:
259 252 return quota_setquota(sb, type, id, addr);
260 down_read(&sb_dqopt(sb)->dqptr_sem); 253 case Q_SYNC:
261 if (!sb_has_quota_active(sb, type)) { 254 if (!sb->s_qcop->quota_sync)
262 up_read(&sb_dqopt(sb)->dqptr_sem); 255 return -ENOSYS;
263 return -ESRCH; 256 return sb->s_qcop->quota_sync(sb, type, 1);
264 } 257 case Q_XQUOTAON:
265 fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; 258 case Q_XQUOTAOFF:
266 up_read(&sb_dqopt(sb)->dqptr_sem); 259 case Q_XQUOTARM:
267 if (copy_to_user(addr, &fmt, sizeof(fmt))) 260 return quota_setxstate(sb, cmd, addr);
268 return -EFAULT; 261 case Q_XGETQSTAT:
269 return 0; 262 return quota_getxstate(sb, addr);
270 } 263 case Q_XSETQLIM:
271 case Q_GETINFO: { 264 return quota_setxquota(sb, type, id, addr);
272 struct if_dqinfo info; 265 case Q_XGETQUOTA:
273 266 return quota_getxquota(sb, type, id, addr);
274 ret = sb->s_qcop->get_info(sb, type, &info); 267 case Q_XQUOTASYNC:
275 if (ret) 268 /* caller already holds s_umount */
276 return ret; 269 if (sb->s_flags & MS_RDONLY)
277 if (copy_to_user(addr, &info, sizeof(info))) 270 return -EROFS;
278 return -EFAULT; 271 writeback_inodes_sb(sb);
279 return 0; 272 return 0;
280 } 273 default:
281 case Q_SETINFO: { 274 return -EINVAL;
282 struct if_dqinfo info;
283
284 if (copy_from_user(&info, addr, sizeof(info)))
285 return -EFAULT;
286 return sb->s_qcop->set_info(sb, type, &info);
287 }
288 case Q_GETQUOTA: {
289 struct if_dqblk idq;
290
291 ret = sb->s_qcop->get_dqblk(sb, type, id, &idq);
292 if (ret)
293 return ret;
294 if (copy_to_user(addr, &idq, sizeof(idq)))
295 return -EFAULT;
296 return 0;
297 }
298 case Q_SETQUOTA: {
299 struct if_dqblk idq;
300
301 if (copy_from_user(&idq, addr, sizeof(idq)))
302 return -EFAULT;
303 return sb->s_qcop->set_dqblk(sb, type, id, &idq);
304 }
305 case Q_SYNC:
306 if (sb)
307 sync_quota_sb(sb, type);
308 else
309 sync_dquots(type);
310 return 0;
311
312 case Q_XQUOTAON:
313 case Q_XQUOTAOFF:
314 case Q_XQUOTARM: {
315 __u32 flags;
316
317 if (copy_from_user(&flags, addr, sizeof(flags)))
318 return -EFAULT;
319 return sb->s_qcop->set_xstate(sb, flags, cmd);
320 }
321 case Q_XGETQSTAT: {
322 struct fs_quota_stat fqs;
323
324 if ((ret = sb->s_qcop->get_xstate(sb, &fqs)))
325 return ret;
326 if (copy_to_user(addr, &fqs, sizeof(fqs)))
327 return -EFAULT;
328 return 0;
329 }
330 case Q_XSETQLIM: {
331 struct fs_disk_quota fdq;
332
333 if (copy_from_user(&fdq, addr, sizeof(fdq)))
334 return -EFAULT;
335 return sb->s_qcop->set_xquota(sb, type, id, &fdq);
336 }
337 case Q_XGETQUOTA: {
338 struct fs_disk_quota fdq;
339
340 ret = sb->s_qcop->get_xquota(sb, type, id, &fdq);
341 if (ret)
342 return ret;
343 if (copy_to_user(addr, &fdq, sizeof(fdq)))
344 return -EFAULT;
345 return 0;
346 }
347 case Q_XQUOTASYNC:
348 return sb->s_qcop->quota_sync(sb, type);
349 /* We never reach here unless validity check is broken */
350 default:
351 BUG();
352 } 275 }
353 return 0;
354} 276}
355 277
356/* 278/*
@@ -397,224 +319,23 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
397 cmds = cmd >> SUBCMDSHIFT; 319 cmds = cmd >> SUBCMDSHIFT;
398 type = cmd & SUBCMDMASK; 320 type = cmd & SUBCMDMASK;
399 321
400 if (cmds != Q_SYNC || special) { 322 /*
401 sb = quotactl_block(special); 323 * As a special case Q_SYNC can be called without a specific device.
402 if (IS_ERR(sb)) 324 * It will iterate all superblocks that have quota enabled and call
403 return PTR_ERR(sb); 325 * the sync action on each of them.
326 */
327 if (!special) {
328 if (cmds == Q_SYNC)
329 return quota_sync_all(type);
330 return -ENODEV;
404 } 331 }
405 332
406 ret = check_quotactl_valid(sb, type, cmds, id); 333 sb = quotactl_block(special);
407 if (ret >= 0) 334 if (IS_ERR(sb))
408 ret = do_quotactl(sb, type, cmds, id, addr); 335 return PTR_ERR(sb);
409 if (sb)
410 drop_super(sb);
411 336
412 return ret; 337 ret = do_quotactl(sb, type, cmds, id, addr);
413}
414
415#if defined(CONFIG_COMPAT_FOR_U64_ALIGNMENT)
416/*
417 * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64)
418 * and is necessary due to alignment problems.
419 */
420struct compat_if_dqblk {
421 compat_u64 dqb_bhardlimit;
422 compat_u64 dqb_bsoftlimit;
423 compat_u64 dqb_curspace;
424 compat_u64 dqb_ihardlimit;
425 compat_u64 dqb_isoftlimit;
426 compat_u64 dqb_curinodes;
427 compat_u64 dqb_btime;
428 compat_u64 dqb_itime;
429 compat_uint_t dqb_valid;
430};
431
432/* XFS structures */
433struct compat_fs_qfilestat {
434 compat_u64 dqb_bhardlimit;
435 compat_u64 qfs_nblks;
436 compat_uint_t qfs_nextents;
437};
438
439struct compat_fs_quota_stat {
440 __s8 qs_version;
441 __u16 qs_flags;
442 __s8 qs_pad;
443 struct compat_fs_qfilestat qs_uquota;
444 struct compat_fs_qfilestat qs_gquota;
445 compat_uint_t qs_incoredqs;
446 compat_int_t qs_btimelimit;
447 compat_int_t qs_itimelimit;
448 compat_int_t qs_rtbtimelimit;
449 __u16 qs_bwarnlimit;
450 __u16 qs_iwarnlimit;
451};
452
453asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
454 qid_t id, void __user *addr)
455{
456 unsigned int cmds;
457 struct if_dqblk __user *dqblk;
458 struct compat_if_dqblk __user *compat_dqblk;
459 struct fs_quota_stat __user *fsqstat;
460 struct compat_fs_quota_stat __user *compat_fsqstat;
461 compat_uint_t data;
462 u16 xdata;
463 long ret;
464 338
465 cmds = cmd >> SUBCMDSHIFT; 339 drop_super(sb);
466
467 switch (cmds) {
468 case Q_GETQUOTA:
469 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
470 compat_dqblk = addr;
471 ret = sys_quotactl(cmd, special, id, dqblk);
472 if (ret)
473 break;
474 if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
475 get_user(data, &dqblk->dqb_valid) ||
476 put_user(data, &compat_dqblk->dqb_valid))
477 ret = -EFAULT;
478 break;
479 case Q_SETQUOTA:
480 dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
481 compat_dqblk = addr;
482 ret = -EFAULT;
483 if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) ||
484 get_user(data, &compat_dqblk->dqb_valid) ||
485 put_user(data, &dqblk->dqb_valid))
486 break;
487 ret = sys_quotactl(cmd, special, id, dqblk);
488 break;
489 case Q_XGETQSTAT:
490 fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
491 compat_fsqstat = addr;
492 ret = sys_quotactl(cmd, special, id, fsqstat);
493 if (ret)
494 break;
495 ret = -EFAULT;
496 /* Copying qs_version, qs_flags, qs_pad */
497 if (copy_in_user(compat_fsqstat, fsqstat,
498 offsetof(struct compat_fs_quota_stat, qs_uquota)))
499 break;
500 /* Copying qs_uquota */
501 if (copy_in_user(&compat_fsqstat->qs_uquota,
502 &fsqstat->qs_uquota,
503 sizeof(compat_fsqstat->qs_uquota)) ||
504 get_user(data, &fsqstat->qs_uquota.qfs_nextents) ||
505 put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents))
506 break;
507 /* Copying qs_gquota */
508 if (copy_in_user(&compat_fsqstat->qs_gquota,
509 &fsqstat->qs_gquota,
510 sizeof(compat_fsqstat->qs_gquota)) ||
511 get_user(data, &fsqstat->qs_gquota.qfs_nextents) ||
512 put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents))
513 break;
514 /* Copying the rest */
515 if (copy_in_user(&compat_fsqstat->qs_incoredqs,
516 &fsqstat->qs_incoredqs,
517 sizeof(struct compat_fs_quota_stat) -
518 offsetof(struct compat_fs_quota_stat, qs_incoredqs)) ||
519 get_user(xdata, &fsqstat->qs_iwarnlimit) ||
520 put_user(xdata, &compat_fsqstat->qs_iwarnlimit))
521 break;
522 ret = 0;
523 break;
524 default:
525 ret = sys_quotactl(cmd, special, id, addr);
526 }
527 return ret; 340 return ret;
528} 341}
529#endif
530
531
532#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
533
534/* Netlink family structure for quota */
535static struct genl_family quota_genl_family = {
536 .id = GENL_ID_GENERATE,
537 .hdrsize = 0,
538 .name = "VFS_DQUOT",
539 .version = 1,
540 .maxattr = QUOTA_NL_A_MAX,
541};
542
543/**
544 * quota_send_warning - Send warning to userspace about exceeded quota
545 * @type: The quota type: USRQQUOTA, GRPQUOTA,...
546 * @id: The user or group id of the quota that was exceeded
547 * @dev: The device on which the fs is mounted (sb->s_dev)
548 * @warntype: The type of the warning: QUOTA_NL_...
549 *
550 * This can be used by filesystems (including those which don't use
551 * dquot) to send a message to userspace relating to quota limits.
552 *
553 */
554
555void quota_send_warning(short type, unsigned int id, dev_t dev,
556 const char warntype)
557{
558 static atomic_t seq;
559 struct sk_buff *skb;
560 void *msg_head;
561 int ret;
562 int msg_size = 4 * nla_total_size(sizeof(u32)) +
563 2 * nla_total_size(sizeof(u64));
564
565 /* We have to allocate using GFP_NOFS as we are called from a
566 * filesystem performing write and thus further recursion into
567 * the fs to free some data could cause deadlocks. */
568 skb = genlmsg_new(msg_size, GFP_NOFS);
569 if (!skb) {
570 printk(KERN_ERR
571 "VFS: Not enough memory to send quota warning.\n");
572 return;
573 }
574 msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
575 &quota_genl_family, 0, QUOTA_NL_C_WARNING);
576 if (!msg_head) {
577 printk(KERN_ERR
578 "VFS: Cannot store netlink header in quota warning.\n");
579 goto err_out;
580 }
581 ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type);
582 if (ret)
583 goto attr_err_out;
584 ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id);
585 if (ret)
586 goto attr_err_out;
587 ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
588 if (ret)
589 goto attr_err_out;
590 ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev));
591 if (ret)
592 goto attr_err_out;
593 ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev));
594 if (ret)
595 goto attr_err_out;
596 ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
597 if (ret)
598 goto attr_err_out;
599 genlmsg_end(skb, msg_head);
600
601 genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
602 return;
603attr_err_out:
604 printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
605err_out:
606 kfree_skb(skb);
607}
608EXPORT_SYMBOL(quota_send_warning);
609
610static int __init quota_init(void)
611{
612 if (genl_register_family(&quota_genl_family) != 0)
613 printk(KERN_ERR
614 "VFS: Failed to create quota netlink interface.\n");
615 return 0;
616};
617
618module_init(quota_init);
619#endif
620
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 2efc57173fd7..5ea4ad81a429 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -21,6 +21,7 @@
21#include <linux/pagevec.h> 21#include <linux/pagevec.h>
22#include <linux/mman.h> 22#include <linux/mman.h>
23#include <linux/sched.h> 23#include <linux/sched.h>
24#include <linux/slab.h>
24 25
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26#include "internal.h" 27#include "internal.h"
@@ -123,30 +124,6 @@ add_error:
123 124
124/*****************************************************************************/ 125/*****************************************************************************/
125/* 126/*
126 * check that file shrinkage doesn't leave any VMAs dangling in midair
127 */
128static int ramfs_nommu_check_mappings(struct inode *inode,
129 size_t newsize, size_t size)
130{
131 struct vm_area_struct *vma;
132 struct prio_tree_iter iter;
133
134 /* search for VMAs that fall within the dead zone */
135 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
136 newsize >> PAGE_SHIFT,
137 (size + PAGE_SIZE - 1) >> PAGE_SHIFT
138 ) {
139 /* found one - only interested if it's shared out of the page
140 * cache */
141 if (vma->vm_flags & VM_SHARED)
142 return -ETXTBSY; /* not quite true, but near enough */
143 }
144
145 return 0;
146}
147
148/*****************************************************************************/
149/*
150 * 127 *
151 */ 128 */
152static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) 129static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
@@ -164,7 +141,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
164 141
165 /* check that a decrease in size doesn't cut off any shared mappings */ 142 /* check that a decrease in size doesn't cut off any shared mappings */
166 if (newsize < size) { 143 if (newsize < size) {
167 ret = ramfs_nommu_check_mappings(inode, newsize, size); 144 ret = nommu_shrink_inode_mappings(inode, size, newsize);
168 if (ret < 0) 145 if (ret < 0)
169 return ret; 146 return ret;
170 } 147 }
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a6090aa1a7c1..c94853473ca9 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -35,6 +35,7 @@
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <linux/parser.h> 36#include <linux/parser.h>
37#include <linux/magic.h> 37#include <linux/magic.h>
38#include <linux/slab.h>
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
39#include "internal.h" 40#include "internal.h"
40 41
diff --git a/fs/read_write.c b/fs/read_write.c
index b7f4a1f94d48..113386d6fd2d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -258,6 +258,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
258 init_sync_kiocb(&kiocb, filp); 258 init_sync_kiocb(&kiocb, filp);
259 kiocb.ki_pos = *ppos; 259 kiocb.ki_pos = *ppos;
260 kiocb.ki_left = len; 260 kiocb.ki_left = len;
261 kiocb.ki_nbytes = len;
261 262
262 for (;;) { 263 for (;;) {
263 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); 264 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
@@ -313,6 +314,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
313 init_sync_kiocb(&kiocb, filp); 314 init_sync_kiocb(&kiocb, filp);
314 kiocb.ki_pos = *ppos; 315 kiocb.ki_pos = *ppos;
315 kiocb.ki_left = len; 316 kiocb.ki_left = len;
317 kiocb.ki_nbytes = len;
316 318
317 for (;;) { 319 for (;;) {
318 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); 320 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index 685495707181..483442e66ed6 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -169,7 +169,7 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
169 return 0; // No free blocks in this bitmap 169 return 0; // No free blocks in this bitmap
170 } 170 }
171 171
172 /* search for a first zero bit -- beggining of a window */ 172 /* search for a first zero bit -- beginning of a window */
173 *beg = reiserfs_find_next_zero_le_bit 173 *beg = reiserfs_find_next_zero_le_bit
174 ((unsigned long *)(bh->b_data), boundary, *beg); 174 ((unsigned long *)(bh->b_data), boundary, *beg);
175 175
@@ -425,7 +425,7 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
425 425
426 journal_mark_dirty(th, s, sbh); 426 journal_mark_dirty(th, s, sbh);
427 if (for_unformatted) 427 if (for_unformatted)
428 vfs_dq_free_block_nodirty(inode, 1); 428 dquot_free_block_nodirty(inode, 1);
429} 429}
430 430
431void reiserfs_free_block(struct reiserfs_transaction_handle *th, 431void reiserfs_free_block(struct reiserfs_transaction_handle *th,
@@ -1049,7 +1049,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
1049 amount_needed, hint->inode->i_uid); 1049 amount_needed, hint->inode->i_uid);
1050#endif 1050#endif
1051 quota_ret = 1051 quota_ret =
1052 vfs_dq_alloc_block_nodirty(hint->inode, amount_needed); 1052 dquot_alloc_block_nodirty(hint->inode, amount_needed);
1053 if (quota_ret) /* Quota exceeded? */ 1053 if (quota_ret) /* Quota exceeded? */
1054 return QUOTA_EXCEEDED; 1054 return QUOTA_EXCEEDED;
1055 if (hint->preallocate && hint->prealloc_size) { 1055 if (hint->preallocate && hint->prealloc_size) {
@@ -1058,7 +1058,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
1058 "reiserquota: allocating (prealloc) %d blocks id=%u", 1058 "reiserquota: allocating (prealloc) %d blocks id=%u",
1059 hint->prealloc_size, hint->inode->i_uid); 1059 hint->prealloc_size, hint->inode->i_uid);
1060#endif 1060#endif
1061 quota_ret = vfs_dq_prealloc_block_nodirty(hint->inode, 1061 quota_ret = dquot_prealloc_block_nodirty(hint->inode,
1062 hint->prealloc_size); 1062 hint->prealloc_size);
1063 if (quota_ret) 1063 if (quota_ret)
1064 hint->preallocate = hint->prealloc_size = 0; 1064 hint->preallocate = hint->prealloc_size = 0;
@@ -1092,7 +1092,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
1092 hint->inode->i_uid); 1092 hint->inode->i_uid);
1093#endif 1093#endif
1094 /* Free not allocated blocks */ 1094 /* Free not allocated blocks */
1095 vfs_dq_free_block_nodirty(hint->inode, 1095 dquot_free_block_nodirty(hint->inode,
1096 amount_needed + hint->prealloc_size - 1096 amount_needed + hint->prealloc_size -
1097 nr_allocated); 1097 nr_allocated);
1098 } 1098 }
@@ -1125,7 +1125,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
1125 REISERFS_I(hint->inode)->i_prealloc_count, 1125 REISERFS_I(hint->inode)->i_prealloc_count,
1126 hint->inode->i_uid); 1126 hint->inode->i_uid);
1127#endif 1127#endif
1128 vfs_dq_free_block_nodirty(hint->inode, amount_needed + 1128 dquot_free_block_nodirty(hint->inode, amount_needed +
1129 hint->prealloc_size - nr_allocated - 1129 hint->prealloc_size - nr_allocated -
1130 REISERFS_I(hint->inode)-> 1130 REISERFS_I(hint->inode)->
1131 i_prealloc_count); 1131 i_prealloc_count);
@@ -1277,7 +1277,10 @@ int reiserfs_init_bitmap_cache(struct super_block *sb)
1277 struct reiserfs_bitmap_info *bitmap; 1277 struct reiserfs_bitmap_info *bitmap;
1278 unsigned int bmap_nr = reiserfs_bmap_count(sb); 1278 unsigned int bmap_nr = reiserfs_bmap_count(sb);
1279 1279
1280 /* Avoid lock recursion in fault case */
1281 reiserfs_write_unlock(sb);
1280 bitmap = vmalloc(sizeof(*bitmap) * bmap_nr); 1282 bitmap = vmalloc(sizeof(*bitmap) * bmap_nr);
1283 reiserfs_write_lock(sb);
1281 if (bitmap == NULL) 1284 if (bitmap == NULL)
1282 return -ENOMEM; 1285 return -ENOMEM;
1283 1286
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index c094f58c7448..07930449a958 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -8,6 +8,7 @@
8#include <linux/reiserfs_fs.h> 8#include <linux/reiserfs_fs.h>
9#include <linux/stat.h> 9#include <linux/stat.h>
10#include <linux/buffer_head.h> 10#include <linux/buffer_head.h>
11#include <linux/slab.h>
11#include <asm/uaccess.h> 12#include <asm/uaccess.h>
12 13
13extern const struct reiserfs_key MIN_KEY; 14extern const struct reiserfs_key MIN_KEY;
@@ -45,8 +46,6 @@ static inline bool is_privroot_deh(struct dentry *dir,
45 struct reiserfs_de_head *deh) 46 struct reiserfs_de_head *deh)
46{ 47{
47 struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root; 48 struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root;
48 if (reiserfs_expose_privroot(dir->d_sb))
49 return 0;
50 return (dir == dir->d_parent && privroot->d_inode && 49 return (dir == dir->d_parent && privroot->d_inode &&
51 deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); 50 deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
52} 51}
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index da2dba082e2d..1d9c12714c5c 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -289,7 +289,7 @@ const struct file_operations reiserfs_file_operations = {
289 .compat_ioctl = reiserfs_compat_ioctl, 289 .compat_ioctl = reiserfs_compat_ioctl,
290#endif 290#endif
291 .mmap = reiserfs_file_mmap, 291 .mmap = reiserfs_file_mmap,
292 .open = generic_file_open, 292 .open = dquot_file_open,
293 .release = reiserfs_file_release, 293 .release = reiserfs_file_release,
294 .fsync = reiserfs_sync_file, 294 .fsync = reiserfs_sync_file,
295 .aio_read = generic_file_aio_read, 295 .aio_read = generic_file_aio_read,
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 6591cb21edf6..1e4250bc3a6f 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -35,6 +35,7 @@
35 **/ 35 **/
36 36
37#include <linux/time.h> 37#include <linux/time.h>
38#include <linux/slab.h>
38#include <linux/string.h> 39#include <linux/string.h>
39#include <linux/reiserfs_fs.h> 40#include <linux/reiserfs_fs.h>
40#include <linux/buffer_head.h> 41#include <linux/buffer_head.h>
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 290ae38fca8a..dc2c65e04853 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -11,6 +11,7 @@
11#include <linux/smp_lock.h> 11#include <linux/smp_lock.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/highmem.h> 13#include <linux/highmem.h>
14#include <linux/slab.h>
14#include <asm/uaccess.h> 15#include <asm/uaccess.h>
15#include <asm/unaligned.h> 16#include <asm/unaligned.h>
16#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
@@ -31,11 +32,15 @@ void reiserfs_delete_inode(struct inode *inode)
31 JOURNAL_PER_BALANCE_CNT * 2 + 32 JOURNAL_PER_BALANCE_CNT * 2 +
32 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb); 33 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
33 struct reiserfs_transaction_handle th; 34 struct reiserfs_transaction_handle th;
35 int depth;
34 int err; 36 int err;
35 37
38 if (!is_bad_inode(inode))
39 dquot_initialize(inode);
40
36 truncate_inode_pages(&inode->i_data, 0); 41 truncate_inode_pages(&inode->i_data, 0);
37 42
38 reiserfs_write_lock(inode->i_sb); 43 depth = reiserfs_write_lock_once(inode->i_sb);
39 44
40 /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ 45 /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
41 if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ 46 if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */
@@ -53,7 +58,7 @@ void reiserfs_delete_inode(struct inode *inode)
53 * after delete_object so that quota updates go into the same transaction as 58 * after delete_object so that quota updates go into the same transaction as
54 * stat data deletion */ 59 * stat data deletion */
55 if (!err) 60 if (!err)
56 vfs_dq_free_inode(inode); 61 dquot_free_inode(inode);
57 62
58 if (journal_end(&th, inode->i_sb, jbegin_count)) 63 if (journal_end(&th, inode->i_sb, jbegin_count))
59 goto out; 64 goto out;
@@ -74,7 +79,7 @@ void reiserfs_delete_inode(struct inode *inode)
74 out: 79 out:
75 clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */ 80 clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */
76 inode->i_blocks = 0; 81 inode->i_blocks = 0;
77 reiserfs_write_unlock(inode->i_sb); 82 reiserfs_write_unlock_once(inode->i_sb, depth);
78} 83}
79 84
80static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid, 85static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
@@ -1496,9 +1501,11 @@ struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key)
1496 1501
1497 args.objectid = key->on_disk_key.k_objectid; 1502 args.objectid = key->on_disk_key.k_objectid;
1498 args.dirid = key->on_disk_key.k_dir_id; 1503 args.dirid = key->on_disk_key.k_dir_id;
1504 reiserfs_write_unlock(s);
1499 inode = iget5_locked(s, key->on_disk_key.k_objectid, 1505 inode = iget5_locked(s, key->on_disk_key.k_objectid,
1500 reiserfs_find_actor, reiserfs_init_locked_inode, 1506 reiserfs_find_actor, reiserfs_init_locked_inode,
1501 (void *)(&args)); 1507 (void *)(&args));
1508 reiserfs_write_lock(s);
1502 if (!inode) 1509 if (!inode)
1503 return ERR_PTR(-ENOMEM); 1510 return ERR_PTR(-ENOMEM);
1504 1511
@@ -1612,7 +1619,7 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
1612** to properly mark inodes for datasync and such, but only actually 1619** to properly mark inodes for datasync and such, but only actually
1613** does something when called for a synchronous update. 1620** does something when called for a synchronous update.
1614*/ 1621*/
1615int reiserfs_write_inode(struct inode *inode, int do_sync) 1622int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1616{ 1623{
1617 struct reiserfs_transaction_handle th; 1624 struct reiserfs_transaction_handle th;
1618 int jbegin_count = 1; 1625 int jbegin_count = 1;
@@ -1624,7 +1631,7 @@ int reiserfs_write_inode(struct inode *inode, int do_sync)
1624 ** inode needs to reach disk for safety, and they can safely be 1631 ** inode needs to reach disk for safety, and they can safely be
1625 ** ignored because the altered inode has already been logged. 1632 ** ignored because the altered inode has already been logged.
1626 */ 1633 */
1627 if (do_sync && !(current->flags & PF_MEMALLOC)) { 1634 if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) {
1628 reiserfs_write_lock(inode->i_sb); 1635 reiserfs_write_lock(inode->i_sb);
1629 if (!journal_begin(&th, inode->i_sb, jbegin_count)) { 1636 if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
1630 reiserfs_update_sd(&th, inode); 1637 reiserfs_update_sd(&th, inode);
@@ -1762,10 +1769,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1762 1769
1763 BUG_ON(!th->t_trans_id); 1770 BUG_ON(!th->t_trans_id);
1764 1771
1765 if (vfs_dq_alloc_inode(inode)) { 1772 dquot_initialize(inode);
1766 err = -EDQUOT; 1773 err = dquot_alloc_inode(inode);
1774 if (err)
1767 goto out_end_trans; 1775 goto out_end_trans;
1768 }
1769 if (!dir->i_nlink) { 1776 if (!dir->i_nlink) {
1770 err = -EPERM; 1777 err = -EPERM;
1771 goto out_bad_inode; 1778 goto out_bad_inode;
@@ -1956,12 +1963,12 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1956 INODE_PKEY(inode)->k_objectid = 0; 1963 INODE_PKEY(inode)->k_objectid = 0;
1957 1964
1958 /* Quota change must be inside a transaction for journaling */ 1965 /* Quota change must be inside a transaction for journaling */
1959 vfs_dq_free_inode(inode); 1966 dquot_free_inode(inode);
1960 1967
1961 out_end_trans: 1968 out_end_trans:
1962 journal_end(th, th->t_super, th->t_blocks_allocated); 1969 journal_end(th, th->t_super, th->t_blocks_allocated);
1963 /* Drop can be outside and it needs more credits so it's better to have it outside */ 1970 /* Drop can be outside and it needs more credits so it's better to have it outside */
1964 vfs_dq_drop(inode); 1971 dquot_drop(inode);
1965 inode->i_flags |= S_NOQUOTA; 1972 inode->i_flags |= S_NOQUOTA;
1966 make_bad_inode(inode); 1973 make_bad_inode(inode);
1967 1974
@@ -3061,14 +3068,17 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
3061int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) 3068int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3062{ 3069{
3063 struct inode *inode = dentry->d_inode; 3070 struct inode *inode = dentry->d_inode;
3064 int error;
3065 unsigned int ia_valid; 3071 unsigned int ia_valid;
3072 int depth;
3073 int error;
3066 3074
3067 /* must be turned off for recursive notify_change calls */ 3075 /* must be turned off for recursive notify_change calls */
3068 ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); 3076 ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
3069 3077
3070 reiserfs_write_lock(inode->i_sb); 3078 depth = reiserfs_write_lock_once(inode->i_sb);
3071 if (attr->ia_valid & ATTR_SIZE) { 3079 if (attr->ia_valid & ATTR_SIZE) {
3080 dquot_initialize(inode);
3081
3072 /* version 2 items will be caught by the s_maxbytes check 3082 /* version 2 items will be caught by the s_maxbytes check
3073 ** done for us in vmtruncate 3083 ** done for us in vmtruncate
3074 */ 3084 */
@@ -3130,8 +3140,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3130 jbegin_count); 3140 jbegin_count);
3131 if (error) 3141 if (error)
3132 goto out; 3142 goto out;
3133 error = 3143 error = dquot_transfer(inode, attr);
3134 vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
3135 if (error) { 3144 if (error) {
3136 journal_end(&th, inode->i_sb, 3145 journal_end(&th, inode->i_sb,
3137 jbegin_count); 3146 jbegin_count);
@@ -3148,8 +3157,17 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3148 journal_end(&th, inode->i_sb, jbegin_count); 3157 journal_end(&th, inode->i_sb, jbegin_count);
3149 } 3158 }
3150 } 3159 }
3151 if (!error) 3160 if (!error) {
3161 /*
3162 * Relax the lock here, as it might truncate the
3163 * inode pages and wait for inode pages locks.
3164 * To release such page lock, the owner needs the
3165 * reiserfs lock
3166 */
3167 reiserfs_write_unlock_once(inode->i_sb, depth);
3152 error = inode_setattr(inode, attr); 3168 error = inode_setattr(inode, attr);
3169 depth = reiserfs_write_lock_once(inode->i_sb);
3170 }
3153 } 3171 }
3154 3172
3155 if (!error && reiserfs_posixacl(inode->i_sb)) { 3173 if (!error && reiserfs_posixacl(inode->i_sb)) {
@@ -3158,7 +3176,8 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3158 } 3176 }
3159 3177
3160 out: 3178 out:
3161 reiserfs_write_unlock(inode->i_sb); 3179 reiserfs_write_unlock_once(inode->i_sb, depth);
3180
3162 return error; 3181 return error;
3163} 3182}
3164 3183
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index ace77451ceb1..f53505de0712 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -104,9 +104,10 @@ setflags_out:
104 err = put_user(inode->i_generation, (int __user *)arg); 104 err = put_user(inode->i_generation, (int __user *)arg);
105 break; 105 break;
106 case REISERFS_IOC_SETVERSION: 106 case REISERFS_IOC_SETVERSION:
107 if (!is_owner_or_cap(inode)) 107 if (!is_owner_or_cap(inode)) {
108 err = -EPERM; 108 err = -EPERM;
109 break; 109 break;
110 }
110 err = mnt_want_write(filp->f_path.mnt); 111 err = mnt_want_write(filp->f_path.mnt);
111 if (err) 112 if (err)
112 break; 113 break;
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 2f8a7e7b8dab..19fbc810e8e7 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -50,6 +50,7 @@
50#include <linux/blkdev.h> 50#include <linux/blkdev.h>
51#include <linux/backing-dev.h> 51#include <linux/backing-dev.h>
52#include <linux/uaccess.h> 52#include <linux/uaccess.h>
53#include <linux/slab.h>
53 54
54#include <asm/system.h> 55#include <asm/system.h>
55 56
@@ -2009,10 +2010,11 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
2009 destroy_workqueue(commit_wq); 2010 destroy_workqueue(commit_wq);
2010 commit_wq = NULL; 2011 commit_wq = NULL;
2011 } 2012 }
2012 reiserfs_write_lock(sb);
2013 2013
2014 free_journal_ram(sb); 2014 free_journal_ram(sb);
2015 2015
2016 reiserfs_write_lock(sb);
2017
2016 return 0; 2018 return 0;
2017} 2019}
2018 2020
@@ -2216,6 +2218,15 @@ static int journal_read_transaction(struct super_block *sb,
2216 brelse(d_bh); 2218 brelse(d_bh);
2217 return 1; 2219 return 1;
2218 } 2220 }
2221
2222 if (bdev_read_only(sb->s_bdev)) {
2223 reiserfs_warning(sb, "clm-2076",
2224 "device is readonly, unable to replay log");
2225 brelse(c_bh);
2226 brelse(d_bh);
2227 return -EROFS;
2228 }
2229
2219 trans_id = get_desc_trans_id(desc); 2230 trans_id = get_desc_trans_id(desc);
2220 /* now we know we've got a good transaction, and it was inside the valid time ranges */ 2231 /* now we know we've got a good transaction, and it was inside the valid time ranges */
2221 log_blocks = kmalloc(get_desc_trans_len(desc) * 2232 log_blocks = kmalloc(get_desc_trans_len(desc) *
@@ -2458,12 +2469,6 @@ static int journal_read(struct super_block *sb)
2458 goto start_log_replay; 2469 goto start_log_replay;
2459 } 2470 }
2460 2471
2461 if (continue_replay && bdev_read_only(sb->s_bdev)) {
2462 reiserfs_warning(sb, "clm-2076",
2463 "device is readonly, unable to replay log");
2464 return -1;
2465 }
2466
2467 /* ok, there are transactions that need to be replayed. start with the first log block, find 2472 /* ok, there are transactions that need to be replayed. start with the first log block, find
2468 ** all the valid transactions, and pick out the oldest. 2473 ** all the valid transactions, and pick out the oldest.
2469 */ 2474 */
@@ -2758,11 +2763,18 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2758 struct reiserfs_journal *journal; 2763 struct reiserfs_journal *journal;
2759 struct reiserfs_journal_list *jl; 2764 struct reiserfs_journal_list *jl;
2760 char b[BDEVNAME_SIZE]; 2765 char b[BDEVNAME_SIZE];
2766 int ret;
2761 2767
2768 /*
2769 * Unlock here to avoid various RECLAIM-FS-ON <-> IN-RECLAIM-FS
2770 * dependency inversion warnings.
2771 */
2772 reiserfs_write_unlock(sb);
2762 journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal)); 2773 journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal));
2763 if (!journal) { 2774 if (!journal) {
2764 reiserfs_warning(sb, "journal-1256", 2775 reiserfs_warning(sb, "journal-1256",
2765 "unable to get memory for journal structure"); 2776 "unable to get memory for journal structure");
2777 reiserfs_write_lock(sb);
2766 return 1; 2778 return 1;
2767 } 2779 }
2768 memset(journal, 0, sizeof(struct reiserfs_journal)); 2780 memset(journal, 0, sizeof(struct reiserfs_journal));
@@ -2771,10 +2783,12 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2771 INIT_LIST_HEAD(&journal->j_working_list); 2783 INIT_LIST_HEAD(&journal->j_working_list);
2772 INIT_LIST_HEAD(&journal->j_journal_list); 2784 INIT_LIST_HEAD(&journal->j_journal_list);
2773 journal->j_persistent_trans = 0; 2785 journal->j_persistent_trans = 0;
2774 if (reiserfs_allocate_list_bitmaps(sb, 2786 ret = reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
2775 journal->j_list_bitmap, 2787 reiserfs_bmap_count(sb));
2776 reiserfs_bmap_count(sb))) 2788 reiserfs_write_lock(sb);
2789 if (ret)
2777 goto free_and_return; 2790 goto free_and_return;
2791
2778 allocate_bitmap_nodes(sb); 2792 allocate_bitmap_nodes(sb);
2779 2793
2780 /* reserved for journal area support */ 2794 /* reserved for journal area support */
@@ -2903,7 +2917,9 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2903 journal->j_mount_id = 10; 2917 journal->j_mount_id = 10;
2904 journal->j_state = 0; 2918 journal->j_state = 0;
2905 atomic_set(&(journal->j_jlock), 0); 2919 atomic_set(&(journal->j_jlock), 0);
2920 reiserfs_write_unlock(sb);
2906 journal->j_cnode_free_list = allocate_cnodes(num_cnodes); 2921 journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
2922 reiserfs_write_lock(sb);
2907 journal->j_cnode_free_orig = journal->j_cnode_free_list; 2923 journal->j_cnode_free_orig = journal->j_cnode_free_list;
2908 journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0; 2924 journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
2909 journal->j_cnode_used = 0; 2925 journal->j_cnode_used = 0;
diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c
index ee2cfc0fd8a7..b87aa2c1afc1 100644
--- a/fs/reiserfs/lock.c
+++ b/fs/reiserfs/lock.c
@@ -86,3 +86,12 @@ void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
86 reiserfs_panic(sb, "%s called without kernel lock held %d", 86 reiserfs_panic(sb, "%s called without kernel lock held %d",
87 caller); 87 caller);
88} 88}
89
90#ifdef CONFIG_REISERFS_CHECK
91void reiserfs_lock_check_recursive(struct super_block *sb)
92{
93 struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
94
95 WARN_ONCE((sb_i->lock_depth > 0), "Unwanted recursive reiserfs lock!\n");
96}
97#endif
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index e296ff72a6cc..d0c43cb99ffc 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -13,6 +13,7 @@
13 13
14#include <linux/time.h> 14#include <linux/time.h>
15#include <linux/bitops.h> 15#include <linux/bitops.h>
16#include <linux/slab.h>
16#include <linux/reiserfs_fs.h> 17#include <linux/reiserfs_fs.h>
17#include <linux/reiserfs_acl.h> 18#include <linux/reiserfs_acl.h>
18#include <linux/reiserfs_xattr.h> 19#include <linux/reiserfs_xattr.h>
@@ -546,7 +547,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
546*/ 547*/
547static int drop_new_inode(struct inode *inode) 548static int drop_new_inode(struct inode *inode)
548{ 549{
549 vfs_dq_drop(inode); 550 dquot_drop(inode);
550 make_bad_inode(inode); 551 make_bad_inode(inode);
551 inode->i_flags |= S_NOQUOTA; 552 inode->i_flags |= S_NOQUOTA;
552 iput(inode); 553 iput(inode);
@@ -554,7 +555,7 @@ static int drop_new_inode(struct inode *inode)
554} 555}
555 556
556/* utility function that does setup for reiserfs_new_inode. 557/* utility function that does setup for reiserfs_new_inode.
557** vfs_dq_init needs lots of credits so it's better to have it 558** dquot_initialize needs lots of credits so it's better to have it
558** outside of a transaction, so we had to pull some bits of 559** outside of a transaction, so we had to pull some bits of
559** reiserfs_new_inode out into this func. 560** reiserfs_new_inode out into this func.
560*/ 561*/
@@ -577,7 +578,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
577 } else { 578 } else {
578 inode->i_gid = current_fsgid(); 579 inode->i_gid = current_fsgid();
579 } 580 }
580 vfs_dq_init(inode); 581 dquot_initialize(inode);
581 return 0; 582 return 0;
582} 583}
583 584
@@ -594,6 +595,8 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
594 struct reiserfs_transaction_handle th; 595 struct reiserfs_transaction_handle th;
595 struct reiserfs_security_handle security; 596 struct reiserfs_security_handle security;
596 597
598 dquot_initialize(dir);
599
597 if (!(inode = new_inode(dir->i_sb))) { 600 if (!(inode = new_inode(dir->i_sb))) {
598 return -ENOMEM; 601 return -ENOMEM;
599 } 602 }
@@ -666,6 +669,8 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
666 if (!new_valid_dev(rdev)) 669 if (!new_valid_dev(rdev))
667 return -EINVAL; 670 return -EINVAL;
668 671
672 dquot_initialize(dir);
673
669 if (!(inode = new_inode(dir->i_sb))) { 674 if (!(inode = new_inode(dir->i_sb))) {
670 return -ENOMEM; 675 return -ENOMEM;
671 } 676 }
@@ -739,6 +744,8 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
739 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + 744 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
740 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); 745 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
741 746
747 dquot_initialize(dir);
748
742#ifdef DISPLACE_NEW_PACKING_LOCALITIES 749#ifdef DISPLACE_NEW_PACKING_LOCALITIES
743 /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */ 750 /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */
744 REISERFS_I(dir)->new_packing_locality = 1; 751 REISERFS_I(dir)->new_packing_locality = 1;
@@ -842,6 +849,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
842 JOURNAL_PER_BALANCE_CNT * 2 + 2 + 849 JOURNAL_PER_BALANCE_CNT * 2 + 2 +
843 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); 850 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
844 851
852 dquot_initialize(dir);
853
845 reiserfs_write_lock(dir->i_sb); 854 reiserfs_write_lock(dir->i_sb);
846 retval = journal_begin(&th, dir->i_sb, jbegin_count); 855 retval = journal_begin(&th, dir->i_sb, jbegin_count);
847 if (retval) 856 if (retval)
@@ -921,6 +930,9 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
921 struct reiserfs_transaction_handle th; 930 struct reiserfs_transaction_handle th;
922 int jbegin_count; 931 int jbegin_count;
923 unsigned long savelink; 932 unsigned long savelink;
933 int depth;
934
935 dquot_initialize(dir);
924 936
925 inode = dentry->d_inode; 937 inode = dentry->d_inode;
926 938
@@ -932,7 +944,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
932 JOURNAL_PER_BALANCE_CNT * 2 + 2 + 944 JOURNAL_PER_BALANCE_CNT * 2 + 2 +
933 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); 945 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
934 946
935 reiserfs_write_lock(dir->i_sb); 947 depth = reiserfs_write_lock_once(dir->i_sb);
936 retval = journal_begin(&th, dir->i_sb, jbegin_count); 948 retval = journal_begin(&th, dir->i_sb, jbegin_count);
937 if (retval) 949 if (retval)
938 goto out_unlink; 950 goto out_unlink;
@@ -993,7 +1005,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
993 1005
994 retval = journal_end(&th, dir->i_sb, jbegin_count); 1006 retval = journal_end(&th, dir->i_sb, jbegin_count);
995 reiserfs_check_path(&path); 1007 reiserfs_check_path(&path);
996 reiserfs_write_unlock(dir->i_sb); 1008 reiserfs_write_unlock_once(dir->i_sb, depth);
997 return retval; 1009 return retval;
998 1010
999 end_unlink: 1011 end_unlink:
@@ -1003,7 +1015,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
1003 if (err) 1015 if (err)
1004 retval = err; 1016 retval = err;
1005 out_unlink: 1017 out_unlink:
1006 reiserfs_write_unlock(dir->i_sb); 1018 reiserfs_write_unlock_once(dir->i_sb, depth);
1007 return retval; 1019 return retval;
1008} 1020}
1009 1021
@@ -1023,6 +1035,8 @@ static int reiserfs_symlink(struct inode *parent_dir,
1023 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) + 1035 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) +
1024 REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb)); 1036 REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb));
1025 1037
1038 dquot_initialize(parent_dir);
1039
1026 if (!(inode = new_inode(parent_dir->i_sb))) { 1040 if (!(inode = new_inode(parent_dir->i_sb))) {
1027 return -ENOMEM; 1041 return -ENOMEM;
1028 } 1042 }
@@ -1110,6 +1124,8 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
1110 JOURNAL_PER_BALANCE_CNT * 3 + 1124 JOURNAL_PER_BALANCE_CNT * 3 +
1111 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); 1125 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
1112 1126
1127 dquot_initialize(dir);
1128
1113 reiserfs_write_lock(dir->i_sb); 1129 reiserfs_write_lock(dir->i_sb);
1114 if (inode->i_nlink >= REISERFS_LINK_MAX) { 1130 if (inode->i_nlink >= REISERFS_LINK_MAX) {
1115 //FIXME: sd_nlink is 32 bit for new files 1131 //FIXME: sd_nlink is 32 bit for new files
@@ -1234,6 +1250,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1234 JOURNAL_PER_BALANCE_CNT * 3 + 5 + 1250 JOURNAL_PER_BALANCE_CNT * 3 + 5 +
1235 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb); 1251 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb);
1236 1252
1253 dquot_initialize(old_dir);
1254 dquot_initialize(new_dir);
1255
1237 old_inode = old_dentry->d_inode; 1256 old_inode = old_dentry->d_inode;
1238 new_dentry_inode = new_dentry->d_inode; 1257 new_dentry_inode = new_dentry->d_inode;
1239 1258
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 5fa7118f04e1..313d39d639eb 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -1299,7 +1299,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
1299 "reiserquota delete_item(): freeing %u, id=%u type=%c", 1299 "reiserquota delete_item(): freeing %u, id=%u type=%c",
1300 quota_cut_bytes, inode->i_uid, head2type(&s_ih)); 1300 quota_cut_bytes, inode->i_uid, head2type(&s_ih));
1301#endif 1301#endif
1302 vfs_dq_free_space_nodirty(inode, quota_cut_bytes); 1302 dquot_free_space_nodirty(inode, quota_cut_bytes);
1303 1303
1304 /* Return deleted body length */ 1304 /* Return deleted body length */
1305 return ret_value; 1305 return ret_value;
@@ -1383,7 +1383,7 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
1383 quota_cut_bytes, inode->i_uid, 1383 quota_cut_bytes, inode->i_uid,
1384 key2type(key)); 1384 key2type(key));
1385#endif 1385#endif
1386 vfs_dq_free_space_nodirty(inode, 1386 dquot_free_space_nodirty(inode,
1387 quota_cut_bytes); 1387 quota_cut_bytes);
1388 } 1388 }
1389 break; 1389 break;
@@ -1733,7 +1733,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
1733 "reiserquota cut_from_item(): freeing %u id=%u type=%c", 1733 "reiserquota cut_from_item(): freeing %u id=%u type=%c",
1734 quota_cut_bytes, inode->i_uid, '?'); 1734 quota_cut_bytes, inode->i_uid, '?');
1735#endif 1735#endif
1736 vfs_dq_free_space_nodirty(inode, quota_cut_bytes); 1736 dquot_free_space_nodirty(inode, quota_cut_bytes);
1737 return ret_value; 1737 return ret_value;
1738} 1738}
1739 1739
@@ -1968,9 +1968,10 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
1968 key2type(&(key->on_disk_key))); 1968 key2type(&(key->on_disk_key)));
1969#endif 1969#endif
1970 1970
1971 if (vfs_dq_alloc_space_nodirty(inode, pasted_size)) { 1971 retval = dquot_alloc_space_nodirty(inode, pasted_size);
1972 if (retval) {
1972 pathrelse(search_path); 1973 pathrelse(search_path);
1973 return -EDQUOT; 1974 return retval;
1974 } 1975 }
1975 init_tb_struct(th, &s_paste_balance, th->t_super, search_path, 1976 init_tb_struct(th, &s_paste_balance, th->t_super, search_path,
1976 pasted_size); 1977 pasted_size);
@@ -2024,7 +2025,7 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
2024 pasted_size, inode->i_uid, 2025 pasted_size, inode->i_uid,
2025 key2type(&(key->on_disk_key))); 2026 key2type(&(key->on_disk_key)));
2026#endif 2027#endif
2027 vfs_dq_free_space_nodirty(inode, pasted_size); 2028 dquot_free_space_nodirty(inode, pasted_size);
2028 return retval; 2029 return retval;
2029} 2030}
2030 2031
@@ -2062,9 +2063,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
2062#endif 2063#endif
2063 /* We can't dirty inode here. It would be immediately written but 2064 /* We can't dirty inode here. It would be immediately written but
2064 * appropriate stat item isn't inserted yet... */ 2065 * appropriate stat item isn't inserted yet... */
2065 if (vfs_dq_alloc_space_nodirty(inode, quota_bytes)) { 2066 retval = dquot_alloc_space_nodirty(inode, quota_bytes);
2067 if (retval) {
2066 pathrelse(path); 2068 pathrelse(path);
2067 return -EDQUOT; 2069 return retval;
2068 } 2070 }
2069 } 2071 }
2070 init_tb_struct(th, &s_ins_balance, th->t_super, path, 2072 init_tb_struct(th, &s_ins_balance, th->t_super, path,
@@ -2113,6 +2115,6 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
2113 quota_bytes, inode->i_uid, head2type(ih)); 2115 quota_bytes, inode->i_uid, head2type(ih));
2114#endif 2116#endif
2115 if (inode) 2117 if (inode)
2116 vfs_dq_free_space_nodirty(inode, quota_bytes); 2118 dquot_free_space_nodirty(inode, quota_bytes);
2117 return retval; 2119 return retval;
2118} 2120}
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index b4a7dd03bdb9..59125fb36d42 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -12,6 +12,7 @@
12 */ 12 */
13 13
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/slab.h>
15#include <linux/vmalloc.h> 16#include <linux/vmalloc.h>
16#include <linux/time.h> 17#include <linux/time.h>
17#include <asm/uaccess.h> 18#include <asm/uaccess.h>
@@ -246,7 +247,7 @@ static int finish_unfinished(struct super_block *s)
246 retval = remove_save_link_only(s, &save_link_key, 0); 247 retval = remove_save_link_only(s, &save_link_key, 0);
247 continue; 248 continue;
248 } 249 }
249 vfs_dq_init(inode); 250 dquot_initialize(inode);
250 251
251 if (truncate && S_ISDIR(inode->i_mode)) { 252 if (truncate && S_ISDIR(inode->i_mode)) {
252 /* We got a truncate request for a dir which is impossible. 253 /* We got a truncate request for a dir which is impossible.
@@ -578,6 +579,11 @@ out:
578 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 579 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
579} 580}
580 581
582static void reiserfs_clear_inode(struct inode *inode)
583{
584 dquot_drop(inode);
585}
586
581#ifdef CONFIG_QUOTA 587#ifdef CONFIG_QUOTA
582static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, 588static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
583 size_t, loff_t); 589 size_t, loff_t);
@@ -590,6 +596,7 @@ static const struct super_operations reiserfs_sops = {
590 .destroy_inode = reiserfs_destroy_inode, 596 .destroy_inode = reiserfs_destroy_inode,
591 .write_inode = reiserfs_write_inode, 597 .write_inode = reiserfs_write_inode,
592 .dirty_inode = reiserfs_dirty_inode, 598 .dirty_inode = reiserfs_dirty_inode,
599 .clear_inode = reiserfs_clear_inode,
593 .delete_inode = reiserfs_delete_inode, 600 .delete_inode = reiserfs_delete_inode,
594 .put_super = reiserfs_put_super, 601 .put_super = reiserfs_put_super,
595 .write_super = reiserfs_write_super, 602 .write_super = reiserfs_write_super,
@@ -616,13 +623,6 @@ static int reiserfs_write_info(struct super_block *, int);
616static int reiserfs_quota_on(struct super_block *, int, int, char *, int); 623static int reiserfs_quota_on(struct super_block *, int, int, char *, int);
617 624
618static const struct dquot_operations reiserfs_quota_operations = { 625static const struct dquot_operations reiserfs_quota_operations = {
619 .initialize = dquot_initialize,
620 .drop = dquot_drop,
621 .alloc_space = dquot_alloc_space,
622 .alloc_inode = dquot_alloc_inode,
623 .free_space = dquot_free_space,
624 .free_inode = dquot_free_inode,
625 .transfer = dquot_transfer,
626 .write_dquot = reiserfs_write_dquot, 626 .write_dquot = reiserfs_write_dquot,
627 .acquire_dquot = reiserfs_acquire_dquot, 627 .acquire_dquot = reiserfs_acquire_dquot,
628 .release_dquot = reiserfs_release_dquot, 628 .release_dquot = reiserfs_release_dquot,
@@ -1619,10 +1619,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1619 save_mount_options(s, data); 1619 save_mount_options(s, data);
1620 1620
1621 sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL); 1621 sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
1622 if (!sbi) { 1622 if (!sbi)
1623 errval = -ENOMEM; 1623 return -ENOMEM;
1624 goto error_alloc;
1625 }
1626 s->s_fs_info = sbi; 1624 s->s_fs_info = sbi;
1627 /* Set default values for options: non-aggressive tails, RO on errors */ 1625 /* Set default values for options: non-aggressive tails, RO on errors */
1628 REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL); 1626 REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
@@ -1879,12 +1877,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1879 return (0); 1877 return (0);
1880 1878
1881error: 1879error:
1882 reiserfs_write_unlock(s);
1883error_alloc:
1884 if (jinit_done) { /* kill the commit thread, free journal ram */ 1880 if (jinit_done) { /* kill the commit thread, free journal ram */
1885 journal_release_error(NULL, s); 1881 journal_release_error(NULL, s);
1886 } 1882 }
1887 1883
1884 reiserfs_write_unlock(s);
1885
1888 reiserfs_free_bitmap_cache(s); 1886 reiserfs_free_bitmap_cache(s);
1889 if (SB_BUFFER_WITH_SB(s)) 1887 if (SB_BUFFER_WITH_SB(s))
1890 brelse(SB_BUFFER_WITH_SB(s)); 1888 brelse(SB_BUFFER_WITH_SB(s));
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 8c7033a8b67e..e7cc00e636dc 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -38,6 +38,7 @@
38#include <linux/dcache.h> 38#include <linux/dcache.h>
39#include <linux/namei.h> 39#include <linux/namei.h>
40#include <linux/errno.h> 40#include <linux/errno.h>
41#include <linux/gfp.h>
41#include <linux/fs.h> 42#include <linux/fs.h>
42#include <linux/file.h> 43#include <linux/file.h>
43#include <linux/pagemap.h> 44#include <linux/pagemap.h>
@@ -61,7 +62,6 @@
61static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) 62static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
62{ 63{
63 BUG_ON(!mutex_is_locked(&dir->i_mutex)); 64 BUG_ON(!mutex_is_locked(&dir->i_mutex));
64 vfs_dq_init(dir);
65 return dir->i_op->create(dir, dentry, mode, NULL); 65 return dir->i_op->create(dir, dentry, mode, NULL);
66} 66}
67#endif 67#endif
@@ -69,7 +69,6 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
69static int xattr_mkdir(struct inode *dir, struct dentry *dentry, int mode) 69static int xattr_mkdir(struct inode *dir, struct dentry *dentry, int mode)
70{ 70{
71 BUG_ON(!mutex_is_locked(&dir->i_mutex)); 71 BUG_ON(!mutex_is_locked(&dir->i_mutex));
72 vfs_dq_init(dir);
73 return dir->i_op->mkdir(dir, dentry, mode); 72 return dir->i_op->mkdir(dir, dentry, mode);
74} 73}
75 74
@@ -81,9 +80,9 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry)
81{ 80{
82 int error; 81 int error;
83 BUG_ON(!mutex_is_locked(&dir->i_mutex)); 82 BUG_ON(!mutex_is_locked(&dir->i_mutex));
84 vfs_dq_init(dir);
85 83
86 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); 84 reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
85 I_MUTEX_CHILD, dir->i_sb);
87 error = dir->i_op->unlink(dir, dentry); 86 error = dir->i_op->unlink(dir, dentry);
88 mutex_unlock(&dentry->d_inode->i_mutex); 87 mutex_unlock(&dentry->d_inode->i_mutex);
89 88
@@ -96,9 +95,9 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
96{ 95{
97 int error; 96 int error;
98 BUG_ON(!mutex_is_locked(&dir->i_mutex)); 97 BUG_ON(!mutex_is_locked(&dir->i_mutex));
99 vfs_dq_init(dir);
100 98
101 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); 99 reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
100 I_MUTEX_CHILD, dir->i_sb);
102 dentry_unhash(dentry); 101 dentry_unhash(dentry);
103 error = dir->i_op->rmdir(dir, dentry); 102 error = dir->i_op->rmdir(dir, dentry);
104 if (!error) 103 if (!error)
@@ -235,16 +234,22 @@ static int reiserfs_for_each_xattr(struct inode *inode,
235 if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1) 234 if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1)
236 return 0; 235 return 0;
237 236
237 reiserfs_write_unlock(inode->i_sb);
238 dir = open_xa_dir(inode, XATTR_REPLACE); 238 dir = open_xa_dir(inode, XATTR_REPLACE);
239 if (IS_ERR(dir)) { 239 if (IS_ERR(dir)) {
240 err = PTR_ERR(dir); 240 err = PTR_ERR(dir);
241 reiserfs_write_lock(inode->i_sb);
241 goto out; 242 goto out;
242 } else if (!dir->d_inode) { 243 } else if (!dir->d_inode) {
243 err = 0; 244 err = 0;
245 reiserfs_write_lock(inode->i_sb);
244 goto out_dir; 246 goto out_dir;
245 } 247 }
246 248
247 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); 249 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR);
250
251 reiserfs_write_lock(inode->i_sb);
252
248 buf.xadir = dir; 253 buf.xadir = dir;
249 err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos); 254 err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos);
250 while ((err == 0 || err == -ENOSPC) && buf.count) { 255 while ((err == 0 || err == -ENOSPC) && buf.count) {
@@ -283,8 +288,9 @@ static int reiserfs_for_each_xattr(struct inode *inode,
283 err = journal_begin(&th, inode->i_sb, blocks); 288 err = journal_begin(&th, inode->i_sb, blocks);
284 if (!err) { 289 if (!err) {
285 int jerror; 290 int jerror;
286 mutex_lock_nested(&dir->d_parent->d_inode->i_mutex, 291 reiserfs_mutex_lock_nested_safe(
287 I_MUTEX_XATTR); 292 &dir->d_parent->d_inode->i_mutex,
293 I_MUTEX_XATTR, inode->i_sb);
288 err = action(dir, data); 294 err = action(dir, data);
289 jerror = journal_end(&th, inode->i_sb, blocks); 295 jerror = journal_end(&th, inode->i_sb, blocks);
290 mutex_unlock(&dir->d_parent->d_inode->i_mutex); 296 mutex_unlock(&dir->d_parent->d_inode->i_mutex);
@@ -443,7 +449,9 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name)
443 } 449 }
444 450
445 if (dentry->d_inode) { 451 if (dentry->d_inode) {
452 reiserfs_write_lock(inode->i_sb);
446 err = xattr_unlink(xadir->d_inode, dentry); 453 err = xattr_unlink(xadir->d_inode, dentry);
454 reiserfs_write_unlock(inode->i_sb);
447 update_ctime(inode); 455 update_ctime(inode);
448 } 456 }
449 457
@@ -477,15 +485,24 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
477 if (get_inode_sd_version(inode) == STAT_DATA_V1) 485 if (get_inode_sd_version(inode) == STAT_DATA_V1)
478 return -EOPNOTSUPP; 486 return -EOPNOTSUPP;
479 487
480 if (!buffer) 488 reiserfs_write_unlock(inode->i_sb);
481 return lookup_and_delete_xattr(inode, name); 489
490 if (!buffer) {
491 err = lookup_and_delete_xattr(inode, name);
492 reiserfs_write_lock(inode->i_sb);
493 return err;
494 }
482 495
483 dentry = xattr_lookup(inode, name, flags); 496 dentry = xattr_lookup(inode, name, flags);
484 if (IS_ERR(dentry)) 497 if (IS_ERR(dentry)) {
498 reiserfs_write_lock(inode->i_sb);
485 return PTR_ERR(dentry); 499 return PTR_ERR(dentry);
500 }
486 501
487 down_write(&REISERFS_I(inode)->i_xattr_sem); 502 down_write(&REISERFS_I(inode)->i_xattr_sem);
488 503
504 reiserfs_write_lock(inode->i_sb);
505
489 xahash = xattr_hash(buffer, buffer_size); 506 xahash = xattr_hash(buffer, buffer_size);
490 while (buffer_pos < buffer_size || buffer_pos == 0) { 507 while (buffer_pos < buffer_size || buffer_pos == 0) {
491 size_t chunk; 508 size_t chunk;
@@ -537,11 +554,15 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
537 if (!err && new_size < i_size_read(dentry->d_inode)) { 554 if (!err && new_size < i_size_read(dentry->d_inode)) {
538 struct iattr newattrs = { 555 struct iattr newattrs = {
539 .ia_ctime = current_fs_time(inode->i_sb), 556 .ia_ctime = current_fs_time(inode->i_sb),
540 .ia_size = buffer_size, 557 .ia_size = new_size,
541 .ia_valid = ATTR_SIZE | ATTR_CTIME, 558 .ia_valid = ATTR_SIZE | ATTR_CTIME,
542 }; 559 };
560
561 reiserfs_write_unlock(inode->i_sb);
543 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); 562 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR);
544 down_write(&dentry->d_inode->i_alloc_sem); 563 down_write(&dentry->d_inode->i_alloc_sem);
564 reiserfs_write_lock(inode->i_sb);
565
545 err = reiserfs_setattr(dentry, &newattrs); 566 err = reiserfs_setattr(dentry, &newattrs);
546 up_write(&dentry->d_inode->i_alloc_sem); 567 up_write(&dentry->d_inode->i_alloc_sem);
547 mutex_unlock(&dentry->d_inode->i_mutex); 568 mutex_unlock(&dentry->d_inode->i_mutex);
@@ -952,21 +973,13 @@ int reiserfs_permission(struct inode *inode, int mask)
952 return generic_permission(inode, mask, NULL); 973 return generic_permission(inode, mask, NULL);
953} 974}
954 975
955/* This will catch lookups from the fs root to .reiserfs_priv */ 976static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
956static int
957xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name)
958{ 977{
959 struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root; 978 return -EPERM;
960 if (container_of(q1, struct dentry, d_name) == priv_root)
961 return -ENOENT;
962 if (q1->len == name->len &&
963 !memcmp(q1->name, name->name, name->len))
964 return 0;
965 return 1;
966} 979}
967 980
968static const struct dentry_operations xattr_lookup_poison_ops = { 981static const struct dentry_operations xattr_lookup_poison_ops = {
969 .d_compare = xattr_lookup_poison, 982 .d_revalidate = xattr_hide_revalidate,
970}; 983};
971 984
972int reiserfs_lookup_privroot(struct super_block *s) 985int reiserfs_lookup_privroot(struct super_block *s)
@@ -980,8 +993,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
980 strlen(PRIVROOT_NAME)); 993 strlen(PRIVROOT_NAME));
981 if (!IS_ERR(dentry)) { 994 if (!IS_ERR(dentry)) {
982 REISERFS_SB(s)->priv_root = dentry; 995 REISERFS_SB(s)->priv_root = dentry;
983 if (!reiserfs_expose_privroot(s)) 996 dentry->d_op = &xattr_lookup_poison_ops;
984 s->s_root->d_op = &xattr_lookup_poison_ops;
985 if (dentry->d_inode) 997 if (dentry->d_inode)
986 dentry->d_inode->i_flags |= S_PRIVATE; 998 dentry->d_inode->i_flags |= S_PRIVATE;
987 } else 999 } else
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index cc32e6ada67b..9cdb759645a9 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -5,6 +5,7 @@
5#include <linux/errno.h> 5#include <linux/errno.h>
6#include <linux/pagemap.h> 6#include <linux/pagemap.h>
7#include <linux/xattr.h> 7#include <linux/xattr.h>
8#include <linux/slab.h>
8#include <linux/posix_acl_xattr.h> 9#include <linux/posix_acl_xattr.h>
9#include <linux/reiserfs_xattr.h> 10#include <linux/reiserfs_xattr.h>
10#include <linux/reiserfs_acl.h> 11#include <linux/reiserfs_acl.h>
@@ -455,7 +456,9 @@ int reiserfs_acl_chmod(struct inode *inode)
455 return 0; 456 return 0;
456 } 457 }
457 458
459 reiserfs_write_unlock(inode->i_sb);
458 acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); 460 acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
461 reiserfs_write_lock(inode->i_sb);
459 if (!acl) 462 if (!acl)
460 return 0; 463 return 0;
461 if (IS_ERR(acl)) 464 if (IS_ERR(acl))
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index d8b5bfcbdd30..7271a477c041 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -3,6 +3,7 @@
3#include <linux/fs.h> 3#include <linux/fs.h>
4#include <linux/pagemap.h> 4#include <linux/pagemap.h>
5#include <linux/xattr.h> 5#include <linux/xattr.h>
6#include <linux/slab.h>
6#include <linux/reiserfs_xattr.h> 7#include <linux/reiserfs_xattr.h>
7#include <linux/security.h> 8#include <linux/security.h>
8#include <asm/uaccess.h> 9#include <asm/uaccess.h>
@@ -76,7 +77,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
76 return error; 77 return error;
77 } 78 }
78 79
79 if (sec->length) { 80 if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) {
80 blocks = reiserfs_xattr_jcreate_nblocks(inode) + 81 blocks = reiserfs_xattr_jcreate_nblocks(inode) +
81 reiserfs_xattr_nblocks(inode, sec->length); 82 reiserfs_xattr_nblocks(inode, sec->length);
82 /* We don't want to count the directories twice if we have 83 /* We don't want to count the directories twice if we have
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index c117fa80d1e9..42d213546894 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -544,6 +544,7 @@ error:
544error_rsb_inval: 544error_rsb_inval:
545 ret = -EINVAL; 545 ret = -EINVAL;
546error_rsb: 546error_rsb:
547 kfree(rsb);
547 return ret; 548 return ret;
548} 549}
549 550
diff --git a/fs/select.c b/fs/select.c
index fd38ce2e32e3..500a669f7790 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -691,6 +691,23 @@ SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
691} 691}
692#endif /* HAVE_SET_RESTORE_SIGMASK */ 692#endif /* HAVE_SET_RESTORE_SIGMASK */
693 693
694#ifdef __ARCH_WANT_SYS_OLD_SELECT
695struct sel_arg_struct {
696 unsigned long n;
697 fd_set __user *inp, *outp, *exp;
698 struct timeval __user *tvp;
699};
700
701SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
702{
703 struct sel_arg_struct a;
704
705 if (copy_from_user(&a, arg, sizeof(a)))
706 return -EFAULT;
707 return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
708}
709#endif
710
694struct poll_list { 711struct poll_list {
695 struct poll_list *next; 712 struct poll_list *next;
696 int len; 713 int len;
@@ -821,7 +838,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
821 struct poll_list *walk = head; 838 struct poll_list *walk = head;
822 unsigned long todo = nfds; 839 unsigned long todo = nfds;
823 840
824 if (nfds > current->signal->rlim[RLIMIT_NOFILE].rlim_cur) 841 if (nfds > rlimit(RLIMIT_NOFILE))
825 return -EINVAL; 842 return -EINVAL;
826 843
827 len = min_t(unsigned int, nfds, N_STACK_PPS); 844 len = min_t(unsigned int, nfds, N_STACK_PPS);
diff --git a/fs/seq_file.c b/fs/seq_file.c
index eae7d9dbf3ff..e1f437be6c3c 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -674,7 +674,6 @@ struct list_head *seq_list_start(struct list_head *head, loff_t pos)
674 674
675 return NULL; 675 return NULL;
676} 676}
677
678EXPORT_SYMBOL(seq_list_start); 677EXPORT_SYMBOL(seq_list_start);
679 678
680struct list_head *seq_list_start_head(struct list_head *head, loff_t pos) 679struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
@@ -684,7 +683,6 @@ struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
684 683
685 return seq_list_start(head, pos - 1); 684 return seq_list_start(head, pos - 1);
686} 685}
687
688EXPORT_SYMBOL(seq_list_start_head); 686EXPORT_SYMBOL(seq_list_start_head);
689 687
690struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos) 688struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
@@ -695,5 +693,131 @@ struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
695 ++*ppos; 693 ++*ppos;
696 return lh == head ? NULL : lh; 694 return lh == head ? NULL : lh;
697} 695}
698
699EXPORT_SYMBOL(seq_list_next); 696EXPORT_SYMBOL(seq_list_next);
697
698/**
699 * seq_hlist_start - start an iteration of a hlist
700 * @head: the head of the hlist
701 * @pos: the start position of the sequence
702 *
703 * Called at seq_file->op->start().
704 */
705struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos)
706{
707 struct hlist_node *node;
708
709 hlist_for_each(node, head)
710 if (pos-- == 0)
711 return node;
712 return NULL;
713}
714EXPORT_SYMBOL(seq_hlist_start);
715
716/**
717 * seq_hlist_start_head - start an iteration of a hlist
718 * @head: the head of the hlist
719 * @pos: the start position of the sequence
720 *
721 * Called at seq_file->op->start(). Call this function if you want to
722 * print a header at the top of the output.
723 */
724struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos)
725{
726 if (!pos)
727 return SEQ_START_TOKEN;
728
729 return seq_hlist_start(head, pos - 1);
730}
731EXPORT_SYMBOL(seq_hlist_start_head);
732
733/**
734 * seq_hlist_next - move to the next position of the hlist
735 * @v: the current iterator
736 * @head: the head of the hlist
737 * @ppos: the current position
738 *
739 * Called at seq_file->op->next().
740 */
741struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head,
742 loff_t *ppos)
743{
744 struct hlist_node *node = v;
745
746 ++*ppos;
747 if (v == SEQ_START_TOKEN)
748 return head->first;
749 else
750 return node->next;
751}
752EXPORT_SYMBOL(seq_hlist_next);
753
754/**
755 * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU
756 * @head: the head of the hlist
757 * @pos: the start position of the sequence
758 *
759 * Called at seq_file->op->start().
760 *
761 * This list-traversal primitive may safely run concurrently with
762 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
763 * as long as the traversal is guarded by rcu_read_lock().
764 */
765struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head,
766 loff_t pos)
767{
768 struct hlist_node *node;
769
770 __hlist_for_each_rcu(node, head)
771 if (pos-- == 0)
772 return node;
773 return NULL;
774}
775EXPORT_SYMBOL(seq_hlist_start_rcu);
776
777/**
778 * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU
779 * @head: the head of the hlist
780 * @pos: the start position of the sequence
781 *
782 * Called at seq_file->op->start(). Call this function if you want to
783 * print a header at the top of the output.
784 *
785 * This list-traversal primitive may safely run concurrently with
786 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
787 * as long as the traversal is guarded by rcu_read_lock().
788 */
789struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head,
790 loff_t pos)
791{
792 if (!pos)
793 return SEQ_START_TOKEN;
794
795 return seq_hlist_start_rcu(head, pos - 1);
796}
797EXPORT_SYMBOL(seq_hlist_start_head_rcu);
798
799/**
800 * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU
801 * @v: the current iterator
802 * @head: the head of the hlist
803 * @ppos: the current position
804 *
805 * Called at seq_file->op->next().
806 *
807 * This list-traversal primitive may safely run concurrently with
808 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
809 * as long as the traversal is guarded by rcu_read_lock().
810 */
811struct hlist_node *seq_hlist_next_rcu(void *v,
812 struct hlist_head *head,
813 loff_t *ppos)
814{
815 struct hlist_node *node = v;
816
817 ++*ppos;
818 if (v == SEQ_START_TOKEN)
819 return rcu_dereference(head->first);
820 else
821 return rcu_dereference(node->next);
822}
823EXPORT_SYMBOL(seq_hlist_next_rcu);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 1dabe4ee02fe..f329849ce3c0 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -22,6 +22,7 @@
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/slab.h>
25#include <linux/kernel.h> 26#include <linux/kernel.h>
26#include <linux/signal.h> 27#include <linux/signal.h>
27#include <linux/list.h> 28#include <linux/list.h>
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index 92d5e8ffb639..dbf6548bbf06 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -13,7 +13,6 @@
13#include <linux/fcntl.h> 13#include <linux/fcntl.h>
14#include <linux/stat.h> 14#include <linux/stat.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/slab.h>
17#include <linux/pagemap.h> 16#include <linux/pagemap.h>
18#include <linux/smp_lock.h> 17#include <linux/smp_lock.h>
19#include <linux/net.h> 18#include <linux/net.h>
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 1c4c8f089970..dfa1d67f8fca 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -479,6 +479,7 @@ smb_put_super(struct super_block *sb)
479 if (server->conn_pid) 479 if (server->conn_pid)
480 kill_pid(server->conn_pid, SIGTERM, 1); 480 kill_pid(server->conn_pid, SIGTERM, 1);
481 481
482 bdi_destroy(&server->bdi);
482 kfree(server->ops); 483 kfree(server->ops);
483 smb_unload_nls(server); 484 smb_unload_nls(server);
484 sb->s_fs_info = NULL; 485 sb->s_fs_info = NULL;
@@ -525,6 +526,11 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
525 if (!server) 526 if (!server)
526 goto out_no_server; 527 goto out_no_server;
527 sb->s_fs_info = server; 528 sb->s_fs_info = server;
529
530 if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY))
531 goto out_bdi;
532
533 sb->s_bdi = &server->bdi;
528 534
529 server->super_block = sb; 535 server->super_block = sb;
530 server->mnt = NULL; 536 server->mnt = NULL;
@@ -624,6 +630,8 @@ out_no_smbiod:
624out_bad_option: 630out_bad_option:
625 kfree(mem); 631 kfree(mem);
626out_no_mem: 632out_no_mem:
633 bdi_destroy(&server->bdi);
634out_bdi:
627 if (!server->mnt) 635 if (!server->mnt)
628 printk(KERN_ERR "smb_fill_super: allocation failure\n"); 636 printk(KERN_ERR "smb_fill_super: allocation failure\n");
629 sb->s_fs_info = NULL; 637 sb->s_fs_info = NULL;
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
index 6bd9b691a463..0e39a924f10a 100644
--- a/fs/smbfs/smbiod.c
+++ b/fs/smbfs/smbiod.c
@@ -12,7 +12,6 @@
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/stat.h> 13#include <linux/stat.h>
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/slab.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/file.h> 16#include <linux/file.h>
18#include <linux/dcache.h> 17#include <linux/dcache.h>
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
index 00b2909bd469..54350b59046b 100644
--- a/fs/smbfs/symlink.c
+++ b/fs/smbfs/symlink.c
@@ -15,6 +15,7 @@
15#include <linux/pagemap.h> 15#include <linux/pagemap.h>
16#include <linux/net.h> 16#include <linux/net.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/slab.h>
18 19
19#include <asm/uaccess.h> 20#include <asm/uaccess.h>
20#include <asm/system.h> 21#include <asm/system.h>
diff --git a/fs/splice.c b/fs/splice.c
index 39208663aaf1..9313b6124a2e 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -30,6 +30,7 @@
30#include <linux/syscalls.h> 30#include <linux/syscalls.h>
31#include <linux/uio.h> 31#include <linux/uio.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/gfp.h>
33 34
34/* 35/*
35 * Attempt to steal a page from a pipe buffer. This should perhaps go into 36 * Attempt to steal a page from a pipe buffer. This should perhaps go into
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 70e3244fa30f..df8a19ef870d 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -4,4 +4,4 @@
4 4
5obj-$(CONFIG_SQUASHFS) += squashfs.o 5obj-$(CONFIG_SQUASHFS) += squashfs.o
6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o 6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
7squashfs-y += namei.o super.o symlink.o 7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 2a7960310349..653c030eb840 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -29,15 +29,14 @@
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/vfs.h> 30#include <linux/vfs.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/mutex.h>
33#include <linux/string.h> 32#include <linux/string.h>
34#include <linux/buffer_head.h> 33#include <linux/buffer_head.h>
35#include <linux/zlib.h>
36 34
37#include "squashfs_fs.h" 35#include "squashfs_fs.h"
38#include "squashfs_fs_sb.h" 36#include "squashfs_fs_sb.h"
39#include "squashfs_fs_i.h" 37#include "squashfs_fs_i.h"
40#include "squashfs.h" 38#include "squashfs.h"
39#include "decompressor.h"
41 40
42/* 41/*
43 * Read the metadata block length, this is stored in the first two 42 * Read the metadata block length, this is stored in the first two
@@ -88,9 +87,8 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
88 u64 cur_index = index >> msblk->devblksize_log2; 87 u64 cur_index = index >> msblk->devblksize_log2;
89 int bytes, compressed, b = 0, k = 0, page = 0, avail; 88 int bytes, compressed, b = 0, k = 0, page = 0, avail;
90 89
91 90 bh = kcalloc(((srclength + msblk->devblksize - 1)
92 bh = kcalloc((msblk->block_size >> msblk->devblksize_log2) + 1, 91 >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL);
93 sizeof(*bh), GFP_KERNEL);
94 if (bh == NULL) 92 if (bh == NULL)
95 return -ENOMEM; 93 return -ENOMEM;
96 94
@@ -153,72 +151,10 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
153 } 151 }
154 152
155 if (compressed) { 153 if (compressed) {
156 int zlib_err = 0, zlib_init = 0; 154 length = squashfs_decompress(msblk, buffer, bh, b, offset,
157 155 length, srclength, pages);
158 /* 156 if (length < 0)
159 * Uncompress block. 157 goto read_failure;
160 */
161
162 mutex_lock(&msblk->read_data_mutex);
163
164 msblk->stream.avail_out = 0;
165 msblk->stream.avail_in = 0;
166
167 bytes = length;
168 do {
169 if (msblk->stream.avail_in == 0 && k < b) {
170 avail = min(bytes, msblk->devblksize - offset);
171 bytes -= avail;
172 wait_on_buffer(bh[k]);
173 if (!buffer_uptodate(bh[k]))
174 goto release_mutex;
175
176 if (avail == 0) {
177 offset = 0;
178 put_bh(bh[k++]);
179 continue;
180 }
181
182 msblk->stream.next_in = bh[k]->b_data + offset;
183 msblk->stream.avail_in = avail;
184 offset = 0;
185 }
186
187 if (msblk->stream.avail_out == 0 && page < pages) {
188 msblk->stream.next_out = buffer[page++];
189 msblk->stream.avail_out = PAGE_CACHE_SIZE;
190 }
191
192 if (!zlib_init) {
193 zlib_err = zlib_inflateInit(&msblk->stream);
194 if (zlib_err != Z_OK) {
195 ERROR("zlib_inflateInit returned"
196 " unexpected result 0x%x,"
197 " srclength %d\n", zlib_err,
198 srclength);
199 goto release_mutex;
200 }
201 zlib_init = 1;
202 }
203
204 zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH);
205
206 if (msblk->stream.avail_in == 0 && k < b)
207 put_bh(bh[k++]);
208 } while (zlib_err == Z_OK);
209
210 if (zlib_err != Z_STREAM_END) {
211 ERROR("zlib_inflate error, data probably corrupt\n");
212 goto release_mutex;
213 }
214
215 zlib_err = zlib_inflateEnd(&msblk->stream);
216 if (zlib_err != Z_OK) {
217 ERROR("zlib_inflate error, data probably corrupt\n");
218 goto release_mutex;
219 }
220 length = msblk->stream.total_out;
221 mutex_unlock(&msblk->read_data_mutex);
222 } else { 158 } else {
223 /* 159 /*
224 * Block is uncompressed. 160 * Block is uncompressed.
@@ -255,9 +191,6 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
255 kfree(bh); 191 kfree(bh);
256 return length; 192 return length;
257 193
258release_mutex:
259 mutex_unlock(&msblk->read_data_mutex);
260
261block_release: 194block_release:
262 for (; k < b; k++) 195 for (; k < b; k++)
263 put_bh(bh[k]); 196 put_bh(bh[k]);
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 40c98fa6b5d6..57314bee9059 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -51,7 +51,6 @@
51#include <linux/sched.h> 51#include <linux/sched.h>
52#include <linux/spinlock.h> 52#include <linux/spinlock.h>
53#include <linux/wait.h> 53#include <linux/wait.h>
54#include <linux/zlib.h>
55#include <linux/pagemap.h> 54#include <linux/pagemap.h>
56 55
57#include "squashfs_fs.h" 56#include "squashfs_fs.h"
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
new file mode 100644
index 000000000000..157478da6ac9
--- /dev/null
+++ b/fs/squashfs/decompressor.c
@@ -0,0 +1,68 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * decompressor.c
22 */
23
24#include <linux/types.h>
25#include <linux/mutex.h>
26#include <linux/buffer_head.h>
27
28#include "squashfs_fs.h"
29#include "squashfs_fs_sb.h"
30#include "squashfs_fs_i.h"
31#include "decompressor.h"
32#include "squashfs.h"
33
34/*
35 * This file (and decompressor.h) implements a decompressor framework for
36 * Squashfs, allowing multiple decompressors to be easily supported
37 */
38
39static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
40 NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
41};
42
43static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = {
44 NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
45};
46
47static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
48 NULL, NULL, NULL, 0, "unknown", 0
49};
50
51static const struct squashfs_decompressor *decompressor[] = {
52 &squashfs_zlib_comp_ops,
53 &squashfs_lzma_unsupported_comp_ops,
54 &squashfs_lzo_unsupported_comp_ops,
55 &squashfs_unknown_comp_ops
56};
57
58
59const struct squashfs_decompressor *squashfs_lookup_decompressor(int id)
60{
61 int i;
62
63 for (i = 0; decompressor[i]->id; i++)
64 if (id == decompressor[i]->id)
65 break;
66
67 return decompressor[i];
68}
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
new file mode 100644
index 000000000000..7425f80783f6
--- /dev/null
+++ b/fs/squashfs/decompressor.h
@@ -0,0 +1,55 @@
1#ifndef DECOMPRESSOR_H
2#define DECOMPRESSOR_H
3/*
4 * Squashfs - a compressed read only filesystem for Linux
5 *
6 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
7 * Phillip Lougher <phillip@lougher.demon.co.uk>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version 2,
12 * or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 *
23 * decompressor.h
24 */
25
/*
 * Operations and identity of one decompressor.
 *
 * @init:       set up per-filesystem decompressor state; the returned
 *              pointer is what later gets handed to @free
 * @free:       release the state returned by @init
 * @decompress: decompress one block; arguments are forwarded unchanged
 *              by squashfs_decompress()
 * @id:         compression id this entry is matched against
 * @name:       human-readable name of the compression scheme
 * @supported:  non-zero when this decompressor can actually be used
 */
struct squashfs_decompressor {
	void	*(*init)(struct squashfs_sb_info *msblk);
	void	(*free)(void *s);
	int	(*decompress)(struct squashfs_sb_info *msblk, void **buffer,
			struct buffer_head **bh, int b, int offset,
			int length, int srclength, int pages);
	int	id;
	char	*name;
	int	supported;
};
35
36static inline void *squashfs_decompressor_init(struct squashfs_sb_info *msblk)
37{
38 return msblk->decompressor->init(msblk);
39}
40
41static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk,
42 void *s)
43{
44 if (msblk->decompressor)
45 msblk->decompressor->free(s);
46}
47
48static inline int squashfs_decompress(struct squashfs_sb_info *msblk,
49 void **buffer, struct buffer_head **bh, int b, int offset, int length,
50 int srclength, int pages)
51{
52 return msblk->decompressor->decompress(msblk, buffer, bh, b, offset,
53 length, srclength, pages);
54}
55#endif
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 566b0eaed868..12b933ac6585 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -30,7 +30,6 @@
30#include <linux/fs.h> 30#include <linux/fs.h>
31#include <linux/vfs.h> 31#include <linux/vfs.h>
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/zlib.h>
34 33
35#include "squashfs_fs.h" 34#include "squashfs_fs.h"
36#include "squashfs_fs_sb.h" 35#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 2b1b8fe5e037..7f93d5a9ee05 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -39,7 +39,6 @@
39#include <linux/vfs.h> 39#include <linux/vfs.h>
40#include <linux/dcache.h> 40#include <linux/dcache.h>
41#include <linux/exportfs.h> 41#include <linux/exportfs.h>
42#include <linux/zlib.h>
43#include <linux/slab.h> 42#include <linux/slab.h>
44 43
45#include "squashfs_fs.h" 44#include "squashfs_fs.h"
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 717767d831df..a25c5060bdcb 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -47,7 +47,6 @@
47#include <linux/string.h> 47#include <linux/string.h>
48#include <linux/pagemap.h> 48#include <linux/pagemap.h>
49#include <linux/mutex.h> 49#include <linux/mutex.h>
50#include <linux/zlib.h>
51 50
52#include "squashfs_fs.h" 51#include "squashfs_fs.h"
53#include "squashfs_fs_sb.h" 52#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index b5a2c15bbbc7..7c90bbd6879d 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -36,7 +36,6 @@
36#include <linux/fs.h> 36#include <linux/fs.h>
37#include <linux/vfs.h> 37#include <linux/vfs.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/zlib.h>
40 39
41#include "squashfs_fs.h" 40#include "squashfs_fs.h"
42#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index 3795b837ba28..b7f64bcd2b70 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -34,7 +34,6 @@
34#include <linux/fs.h> 34#include <linux/fs.h>
35#include <linux/vfs.h> 35#include <linux/vfs.h>
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/zlib.h>
38 37
39#include "squashfs_fs.h" 38#include "squashfs_fs.h"
40#include "squashfs_fs_sb.h" 39#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 9101dbde39ec..49daaf669e41 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -40,7 +40,6 @@
40 40
41#include <linux/fs.h> 41#include <linux/fs.h>
42#include <linux/vfs.h> 42#include <linux/vfs.h>
43#include <linux/zlib.h>
44 43
45#include "squashfs_fs.h" 44#include "squashfs_fs.h"
46#include "squashfs_fs_sb.h" 45#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 9e398653b22b..5266bd8ad932 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -57,7 +57,6 @@
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/string.h> 58#include <linux/string.h>
59#include <linux/dcache.h> 59#include <linux/dcache.h>
60#include <linux/zlib.h>
61 60
62#include "squashfs_fs.h" 61#include "squashfs_fs.h"
63#include "squashfs_fs_sb.h" 62#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 0e9feb6adf7e..fe2587af5512 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -51,6 +51,9 @@ extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *,
51 u64, int); 51 u64, int);
52extern int squashfs_read_table(struct super_block *, void *, u64, int); 52extern int squashfs_read_table(struct super_block *, void *, u64, int);
53 53
54/* decompressor.c */
55extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
56
54/* export.c */ 57/* export.c */
55extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, 58extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64,
56 unsigned int); 59 unsigned int);
@@ -71,7 +74,7 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
71extern int squashfs_read_inode(struct inode *, long long); 74extern int squashfs_read_inode(struct inode *, long long);
72 75
73/* 76/*
74 * Inodes and files operations 77 * Inodes, files and decompressor operations
75 */ 78 */
76 79
77/* dir.c */ 80/* dir.c */
@@ -88,3 +91,6 @@ extern const struct inode_operations squashfs_dir_inode_ops;
88 91
89/* symlink.c */ 92/* symlink.c */
90extern const struct address_space_operations squashfs_symlink_aops; 93extern const struct address_space_operations squashfs_symlink_aops;
94
95/* zlib_wrapper.c */
96extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 283daafc568e..79024245ea00 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -183,8 +183,6 @@
183#define SQUASHFS_MAX_FILE_SIZE (1LL << \ 183#define SQUASHFS_MAX_FILE_SIZE (1LL << \
184 (SQUASHFS_MAX_FILE_SIZE_LOG - 2)) 184 (SQUASHFS_MAX_FILE_SIZE_LOG - 2))
185 185
186#define SQUASHFS_MARKER_BYTE 0xff
187
188/* meta index cache */ 186/* meta index cache */
189#define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int)) 187#define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int))
190#define SQUASHFS_META_ENTRIES 127 188#define SQUASHFS_META_ENTRIES 127
@@ -211,7 +209,9 @@ struct meta_index {
211/* 209/*
212 * definitions for structures on disk 210 * definitions for structures on disk
213 */ 211 */
214#define ZLIB_COMPRESSION 1 212#define ZLIB_COMPRESSION 1
213#define LZMA_COMPRESSION 2
214#define LZO_COMPRESSION 3
215 215
216struct squashfs_super_block { 216struct squashfs_super_block {
217 __le32 s_magic; 217 __le32 s_magic;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index c8c65614dd1c..2e77dc547e25 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -52,25 +52,25 @@ struct squashfs_cache_entry {
52}; 52};
53 53
54struct squashfs_sb_info { 54struct squashfs_sb_info {
55 int devblksize; 55 const struct squashfs_decompressor *decompressor;
56 int devblksize_log2; 56 int devblksize;
57 struct squashfs_cache *block_cache; 57 int devblksize_log2;
58 struct squashfs_cache *fragment_cache; 58 struct squashfs_cache *block_cache;
59 struct squashfs_cache *read_page; 59 struct squashfs_cache *fragment_cache;
60 int next_meta_index; 60 struct squashfs_cache *read_page;
61 __le64 *id_table; 61 int next_meta_index;
62 __le64 *fragment_index; 62 __le64 *id_table;
63 unsigned int *fragment_index_2; 63 __le64 *fragment_index;
64 struct mutex read_data_mutex; 64 struct mutex read_data_mutex;
65 struct mutex meta_index_mutex; 65 struct mutex meta_index_mutex;
66 struct meta_index *meta_index; 66 struct meta_index *meta_index;
67 z_stream stream; 67 void *stream;
68 __le64 *inode_lookup_table; 68 __le64 *inode_lookup_table;
69 u64 inode_table; 69 u64 inode_table;
70 u64 directory_table; 70 u64 directory_table;
71 unsigned int block_size; 71 unsigned int block_size;
72 unsigned short block_log; 72 unsigned short block_log;
73 long long bytes_used; 73 long long bytes_used;
74 unsigned int inodes; 74 unsigned int inodes;
75}; 75};
76#endif 76#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 6c197ef53add..48b6f4a385a6 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -35,34 +35,41 @@
35#include <linux/pagemap.h> 35#include <linux/pagemap.h>
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/module.h> 37#include <linux/module.h>
38#include <linux/zlib.h>
39#include <linux/magic.h> 38#include <linux/magic.h>
40 39
41#include "squashfs_fs.h" 40#include "squashfs_fs.h"
42#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
43#include "squashfs_fs_i.h" 42#include "squashfs_fs_i.h"
44#include "squashfs.h" 43#include "squashfs.h"
44#include "decompressor.h"
45 45
46static struct file_system_type squashfs_fs_type; 46static struct file_system_type squashfs_fs_type;
47static const struct super_operations squashfs_super_ops; 47static const struct super_operations squashfs_super_ops;
48 48
49static int supported_squashfs_filesystem(short major, short minor, short comp) 49static const struct squashfs_decompressor *supported_squashfs_filesystem(short
50 major, short minor, short id)
50{ 51{
52 const struct squashfs_decompressor *decompressor;
53
51 if (major < SQUASHFS_MAJOR) { 54 if (major < SQUASHFS_MAJOR) {
52 ERROR("Major/Minor mismatch, older Squashfs %d.%d " 55 ERROR("Major/Minor mismatch, older Squashfs %d.%d "
53 "filesystems are unsupported\n", major, minor); 56 "filesystems are unsupported\n", major, minor);
54 return -EINVAL; 57 return NULL;
55 } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) { 58 } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) {
56 ERROR("Major/Minor mismatch, trying to mount newer " 59 ERROR("Major/Minor mismatch, trying to mount newer "
57 "%d.%d filesystem\n", major, minor); 60 "%d.%d filesystem\n", major, minor);
58 ERROR("Please update your kernel\n"); 61 ERROR("Please update your kernel\n");
59 return -EINVAL; 62 return NULL;
60 } 63 }
61 64
62 if (comp != ZLIB_COMPRESSION) 65 decompressor = squashfs_lookup_decompressor(id);
63 return -EINVAL; 66 if (!decompressor->supported) {
67 ERROR("Filesystem uses \"%s\" compression. This is not "
68 "supported\n", decompressor->name);
69 return NULL;
70 }
64 71
65 return 0; 72 return decompressor;
66} 73}
67 74
68 75
@@ -87,13 +94,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
87 } 94 }
88 msblk = sb->s_fs_info; 95 msblk = sb->s_fs_info;
89 96
90 msblk->stream.workspace = kmalloc(zlib_inflate_workspacesize(),
91 GFP_KERNEL);
92 if (msblk->stream.workspace == NULL) {
93 ERROR("Failed to allocate zlib workspace\n");
94 goto failure;
95 }
96
97 sblk = kzalloc(sizeof(*sblk), GFP_KERNEL); 97 sblk = kzalloc(sizeof(*sblk), GFP_KERNEL);
98 if (sblk == NULL) { 98 if (sblk == NULL) {
99 ERROR("Failed to allocate squashfs_super_block\n"); 99 ERROR("Failed to allocate squashfs_super_block\n");
@@ -120,25 +120,25 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
120 goto failed_mount; 120 goto failed_mount;
121 } 121 }
122 122
123 err = -EINVAL;
124
123 /* Check it is a SQUASHFS superblock */ 125 /* Check it is a SQUASHFS superblock */
124 sb->s_magic = le32_to_cpu(sblk->s_magic); 126 sb->s_magic = le32_to_cpu(sblk->s_magic);
125 if (sb->s_magic != SQUASHFS_MAGIC) { 127 if (sb->s_magic != SQUASHFS_MAGIC) {
126 if (!silent) 128 if (!silent)
127 ERROR("Can't find a SQUASHFS superblock on %s\n", 129 ERROR("Can't find a SQUASHFS superblock on %s\n",
128 bdevname(sb->s_bdev, b)); 130 bdevname(sb->s_bdev, b));
129 err = -EINVAL;
130 goto failed_mount; 131 goto failed_mount;
131 } 132 }
132 133
133 /* Check the MAJOR & MINOR versions and compression type */ 134 /* Check the MAJOR & MINOR versions and lookup compression type */
134 err = supported_squashfs_filesystem(le16_to_cpu(sblk->s_major), 135 msblk->decompressor = supported_squashfs_filesystem(
136 le16_to_cpu(sblk->s_major),
135 le16_to_cpu(sblk->s_minor), 137 le16_to_cpu(sblk->s_minor),
136 le16_to_cpu(sblk->compression)); 138 le16_to_cpu(sblk->compression));
137 if (err < 0) 139 if (msblk->decompressor == NULL)
138 goto failed_mount; 140 goto failed_mount;
139 141
140 err = -EINVAL;
141
142 /* 142 /*
143 * Check if there's xattrs in the filesystem. These are not 143 * Check if there's xattrs in the filesystem. These are not
144 * supported in this version, so warn that they will be ignored. 144 * supported in this version, so warn that they will be ignored.
@@ -205,6 +205,10 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
205 205
206 err = -ENOMEM; 206 err = -ENOMEM;
207 207
208 msblk->stream = squashfs_decompressor_init(msblk);
209 if (msblk->stream == NULL)
210 goto failed_mount;
211
208 msblk->block_cache = squashfs_cache_init("metadata", 212 msblk->block_cache = squashfs_cache_init("metadata",
209 SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE); 213 SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
210 if (msblk->block_cache == NULL) 214 if (msblk->block_cache == NULL)
@@ -271,7 +275,8 @@ allocate_root:
271 275
272 err = squashfs_read_inode(root, root_inode); 276 err = squashfs_read_inode(root, root_inode);
273 if (err) { 277 if (err) {
274 iget_failed(root); 278 make_bad_inode(root);
279 iput(root);
275 goto failed_mount; 280 goto failed_mount;
276 } 281 }
277 insert_inode_hash(root); 282 insert_inode_hash(root);
@@ -292,17 +297,16 @@ failed_mount:
292 squashfs_cache_delete(msblk->block_cache); 297 squashfs_cache_delete(msblk->block_cache);
293 squashfs_cache_delete(msblk->fragment_cache); 298 squashfs_cache_delete(msblk->fragment_cache);
294 squashfs_cache_delete(msblk->read_page); 299 squashfs_cache_delete(msblk->read_page);
300 squashfs_decompressor_free(msblk, msblk->stream);
295 kfree(msblk->inode_lookup_table); 301 kfree(msblk->inode_lookup_table);
296 kfree(msblk->fragment_index); 302 kfree(msblk->fragment_index);
297 kfree(msblk->id_table); 303 kfree(msblk->id_table);
298 kfree(msblk->stream.workspace);
299 kfree(sb->s_fs_info); 304 kfree(sb->s_fs_info);
300 sb->s_fs_info = NULL; 305 sb->s_fs_info = NULL;
301 kfree(sblk); 306 kfree(sblk);
302 return err; 307 return err;
303 308
304failure: 309failure:
305 kfree(msblk->stream.workspace);
306 kfree(sb->s_fs_info); 310 kfree(sb->s_fs_info);
307 sb->s_fs_info = NULL; 311 sb->s_fs_info = NULL;
308 return -ENOMEM; 312 return -ENOMEM;
@@ -346,10 +350,11 @@ static void squashfs_put_super(struct super_block *sb)
346 squashfs_cache_delete(sbi->block_cache); 350 squashfs_cache_delete(sbi->block_cache);
347 squashfs_cache_delete(sbi->fragment_cache); 351 squashfs_cache_delete(sbi->fragment_cache);
348 squashfs_cache_delete(sbi->read_page); 352 squashfs_cache_delete(sbi->read_page);
353 squashfs_decompressor_free(sbi, sbi->stream);
349 kfree(sbi->id_table); 354 kfree(sbi->id_table);
350 kfree(sbi->fragment_index); 355 kfree(sbi->fragment_index);
351 kfree(sbi->meta_index); 356 kfree(sbi->meta_index);
352 kfree(sbi->stream.workspace); 357 kfree(sbi->inode_lookup_table);
353 kfree(sb->s_fs_info); 358 kfree(sb->s_fs_info);
354 sb->s_fs_info = NULL; 359 sb->s_fs_info = NULL;
355 } 360 }
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 83d87880aac8..32b911f4ee39 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -33,10 +33,8 @@
33#include <linux/fs.h> 33#include <linux/fs.h>
34#include <linux/vfs.h> 34#include <linux/vfs.h>
35#include <linux/kernel.h> 35#include <linux/kernel.h>
36#include <linux/slab.h>
37#include <linux/string.h> 36#include <linux/string.h>
38#include <linux/pagemap.h> 37#include <linux/pagemap.h>
39#include <linux/zlib.h>
40 38
41#include "squashfs_fs.h" 39#include "squashfs_fs.h"
42#include "squashfs_fs_sb.h" 40#include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
new file mode 100644
index 000000000000..7a603874e483
--- /dev/null
+++ b/fs/squashfs/zlib_wrapper.c
@@ -0,0 +1,152 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * zlib_wrapper.c
22 */
23
24
25#include <linux/mutex.h>
26#include <linux/buffer_head.h>
27#include <linux/slab.h>
28#include <linux/zlib.h>
29
30#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h"
34#include "decompressor.h"
35
36static void *zlib_init(struct squashfs_sb_info *dummy)
37{
38 z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL);
39 if (stream == NULL)
40 goto failed;
41 stream->workspace = kmalloc(zlib_inflate_workspacesize(),
42 GFP_KERNEL);
43 if (stream->workspace == NULL)
44 goto failed;
45
46 return stream;
47
48failed:
49 ERROR("Failed to allocate zlib workspace\n");
50 kfree(stream);
51 return NULL;
52}
53
54
55static void zlib_free(void *strm)
56{
57 z_stream *stream = strm;
58
59 if (stream)
60 kfree(stream->workspace);
61 kfree(stream);
62}
63
64
65static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
66 struct buffer_head **bh, int b, int offset, int length, int srclength,
67 int pages)
68{
69 int zlib_err = 0, zlib_init = 0;
70 int avail, bytes, k = 0, page = 0;
71 z_stream *stream = msblk->stream;
72
73 mutex_lock(&msblk->read_data_mutex);
74
75 stream->avail_out = 0;
76 stream->avail_in = 0;
77
78 bytes = length;
79 do {
80 if (stream->avail_in == 0 && k < b) {
81 avail = min(bytes, msblk->devblksize - offset);
82 bytes -= avail;
83 wait_on_buffer(bh[k]);
84 if (!buffer_uptodate(bh[k]))
85 goto release_mutex;
86
87 if (avail == 0) {
88 offset = 0;
89 put_bh(bh[k++]);
90 continue;
91 }
92
93 stream->next_in = bh[k]->b_data + offset;
94 stream->avail_in = avail;
95 offset = 0;
96 }
97
98 if (stream->avail_out == 0 && page < pages) {
99 stream->next_out = buffer[page++];
100 stream->avail_out = PAGE_CACHE_SIZE;
101 }
102
103 if (!zlib_init) {
104 zlib_err = zlib_inflateInit(stream);
105 if (zlib_err != Z_OK) {
106 ERROR("zlib_inflateInit returned unexpected "
107 "result 0x%x, srclength %d\n",
108 zlib_err, srclength);
109 goto release_mutex;
110 }
111 zlib_init = 1;
112 }
113
114 zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH);
115
116 if (stream->avail_in == 0 && k < b)
117 put_bh(bh[k++]);
118 } while (zlib_err == Z_OK);
119
120 if (zlib_err != Z_STREAM_END) {
121 ERROR("zlib_inflate error, data probably corrupt\n");
122 goto release_mutex;
123 }
124
125 zlib_err = zlib_inflateEnd(stream);
126 if (zlib_err != Z_OK) {
127 ERROR("zlib_inflate error, data probably corrupt\n");
128 goto release_mutex;
129 }
130
131 length = stream->total_out;
132 mutex_unlock(&msblk->read_data_mutex);
133 return length;
134
135release_mutex:
136 mutex_unlock(&msblk->read_data_mutex);
137
138 for (; k < b; k++)
139 put_bh(bh[k]);
140
141 return -EIO;
142}
143
144const struct squashfs_decompressor squashfs_zlib_comp_ops = {
145 .init = zlib_init,
146 .free = zlib_free,
147 .decompress = zlib_uncompress,
148 .id = ZLIB_COMPRESSION,
149 .name = "zlib",
150 .supported = 1
151};
152
diff --git a/fs/super.c b/fs/super.c
index aff046b0fe78..dc72491a19f9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -568,7 +568,7 @@ out:
568int do_remount_sb(struct super_block *sb, int flags, void *data, int force) 568int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
569{ 569{
570 int retval; 570 int retval;
571 int remount_rw; 571 int remount_rw, remount_ro;
572 572
573 if (sb->s_frozen != SB_UNFROZEN) 573 if (sb->s_frozen != SB_UNFROZEN)
574 return -EBUSY; 574 return -EBUSY;
@@ -583,9 +583,12 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
583 shrink_dcache_sb(sb); 583 shrink_dcache_sb(sb);
584 sync_filesystem(sb); 584 sync_filesystem(sb);
585 585
586 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
587 remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
588
586 /* If we are remounting RDONLY and current sb is read/write, 589 /* If we are remounting RDONLY and current sb is read/write,
587 make sure there are no rw files opened */ 590 make sure there are no rw files opened */
588 if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) { 591 if (remount_ro) {
589 if (force) 592 if (force)
590 mark_files_ro(sb); 593 mark_files_ro(sb);
591 else if (!fs_may_remount_ro(sb)) 594 else if (!fs_may_remount_ro(sb))
@@ -594,7 +597,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
594 if (retval < 0 && retval != -ENOSYS) 597 if (retval < 0 && retval != -ENOSYS)
595 return -EBUSY; 598 return -EBUSY;
596 } 599 }
597 remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
598 600
599 if (sb->s_op->remount_fs) { 601 if (sb->s_op->remount_fs) {
600 retval = sb->s_op->remount_fs(sb, &flags, data); 602 retval = sb->s_op->remount_fs(sb, &flags, data);
@@ -604,6 +606,16 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
604 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); 606 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
605 if (remount_rw) 607 if (remount_rw)
606 vfs_dq_quota_on_remount(sb); 608 vfs_dq_quota_on_remount(sb);
609 /*
610 * Some filesystems modify their metadata via some other path than the
611 * bdev buffer cache (eg. use a private mapping, or directories in
612 * pagecache, etc). Also file data modifications go via their own
613 * mappings. So If we try to mount readonly then copy the filesystem
614 * from bdev, we could get stale data, so invalidate it to give a best
615 * effort at coherency.
616 */
617 if (remount_ro && sb->s_bdev)
618 invalidate_bdev(sb->s_bdev);
607 return 0; 619 return 0;
608} 620}
609 621
@@ -681,6 +693,7 @@ int set_anon_super(struct super_block *s, void *data)
681 return -EMFILE; 693 return -EMFILE;
682 } 694 }
683 s->s_dev = MKDEV(0, dev & MINORMASK); 695 s->s_dev = MKDEV(0, dev & MINORMASK);
696 s->s_bdi = &noop_backing_dev_info;
684 return 0; 697 return 0;
685} 698}
686 699
@@ -925,6 +938,9 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
925 if (!mnt) 938 if (!mnt)
926 goto out; 939 goto out;
927 940
941 if (flags & MS_KERNMOUNT)
942 mnt->mnt_flags = MNT_INTERNAL;
943
928 if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) { 944 if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
929 secdata = alloc_secdata(); 945 secdata = alloc_secdata();
930 if (!secdata) 946 if (!secdata)
@@ -939,10 +955,11 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
939 if (error < 0) 955 if (error < 0)
940 goto out_free_secdata; 956 goto out_free_secdata;
941 BUG_ON(!mnt->mnt_sb); 957 BUG_ON(!mnt->mnt_sb);
958 WARN_ON(!mnt->mnt_sb->s_bdi);
942 959
943 error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata); 960 error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
944 if (error) 961 if (error)
945 goto out_sb; 962 goto out_sb;
946 963
947 /* 964 /*
948 * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE 965 * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
diff --git a/fs/sync.c b/fs/sync.c
index 418727a2a239..92b228176f7c 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -5,6 +5,7 @@
5#include <linux/kernel.h> 5#include <linux/kernel.h>
6#include <linux/file.h> 6#include <linux/file.h>
7#include <linux/fs.h> 7#include <linux/fs.h>
8#include <linux/slab.h>
8#include <linux/module.h> 9#include <linux/module.h>
9#include <linux/sched.h> 10#include <linux/sched.h>
10#include <linux/writeback.h> 11#include <linux/writeback.h>
@@ -13,6 +14,7 @@
13#include <linux/pagemap.h> 14#include <linux/pagemap.h>
14#include <linux/quotaops.h> 15#include <linux/quotaops.h>
15#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
17#include <linux/backing-dev.h>
16#include "internal.h" 18#include "internal.h"
17 19
18#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ 20#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
@@ -31,17 +33,17 @@ static int __sync_filesystem(struct super_block *sb, int wait)
31 * This should be safe, as we require bdi backing to actually 33 * This should be safe, as we require bdi backing to actually
32 * write out data in the first place 34 * write out data in the first place
33 */ 35 */
34 if (!sb->s_bdi) 36 if (!sb->s_bdi || sb->s_bdi == &noop_backing_dev_info)
35 return 0; 37 return 0;
36 38
37 /* Avoid doing twice syncing and cache pruning for quota sync */ 39 if (sb->s_qcop && sb->s_qcop->quota_sync)
38 if (!wait) { 40 sb->s_qcop->quota_sync(sb, -1, wait);
39 writeout_quota_sb(sb, -1); 41
40 writeback_inodes_sb(sb); 42 if (wait)
41 } else {
42 sync_quota_sb(sb, -1);
43 sync_inodes_sb(sb); 43 sync_inodes_sb(sb);
44 } 44 else
45 writeback_inodes_sb(sb);
46
45 if (sb->s_op->sync_fs) 47 if (sb->s_op->sync_fs)
46 sb->s_op->sync_fs(sb, wait); 48 sb->s_op->sync_fs(sb, wait);
47 return __sync_blockdev(sb->s_bdev, wait); 49 return __sync_blockdev(sb->s_bdev, wait);
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index a0a500af24a1..e9d293593e52 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -54,14 +54,14 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count)
54 int rc; 54 int rc;
55 55
56 /* need attr_sd for attr, its parent for kobj */ 56 /* need attr_sd for attr, its parent for kobj */
57 if (!sysfs_get_active_two(attr_sd)) 57 if (!sysfs_get_active(attr_sd))
58 return -ENODEV; 58 return -ENODEV;
59 59
60 rc = -EIO; 60 rc = -EIO;
61 if (attr->read) 61 if (attr->read)
62 rc = attr->read(kobj, attr, buffer, off, count); 62 rc = attr->read(kobj, attr, buffer, off, count);
63 63
64 sysfs_put_active_two(attr_sd); 64 sysfs_put_active(attr_sd);
65 65
66 return rc; 66 return rc;
67} 67}
@@ -125,14 +125,14 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count)
125 int rc; 125 int rc;
126 126
127 /* need attr_sd for attr, its parent for kobj */ 127 /* need attr_sd for attr, its parent for kobj */
128 if (!sysfs_get_active_two(attr_sd)) 128 if (!sysfs_get_active(attr_sd))
129 return -ENODEV; 129 return -ENODEV;
130 130
131 rc = -EIO; 131 rc = -EIO;
132 if (attr->write) 132 if (attr->write)
133 rc = attr->write(kobj, attr, buffer, offset, count); 133 rc = attr->write(kobj, attr, buffer, offset, count);
134 134
135 sysfs_put_active_two(attr_sd); 135 sysfs_put_active(attr_sd);
136 136
137 return rc; 137 return rc;
138} 138}
@@ -184,12 +184,12 @@ static void bin_vma_open(struct vm_area_struct *vma)
184 if (!bb->vm_ops || !bb->vm_ops->open) 184 if (!bb->vm_ops || !bb->vm_ops->open)
185 return; 185 return;
186 186
187 if (!sysfs_get_active_two(attr_sd)) 187 if (!sysfs_get_active(attr_sd))
188 return; 188 return;
189 189
190 bb->vm_ops->open(vma); 190 bb->vm_ops->open(vma);
191 191
192 sysfs_put_active_two(attr_sd); 192 sysfs_put_active(attr_sd);
193} 193}
194 194
195static void bin_vma_close(struct vm_area_struct *vma) 195static void bin_vma_close(struct vm_area_struct *vma)
@@ -201,12 +201,12 @@ static void bin_vma_close(struct vm_area_struct *vma)
201 if (!bb->vm_ops || !bb->vm_ops->close) 201 if (!bb->vm_ops || !bb->vm_ops->close)
202 return; 202 return;
203 203
204 if (!sysfs_get_active_two(attr_sd)) 204 if (!sysfs_get_active(attr_sd))
205 return; 205 return;
206 206
207 bb->vm_ops->close(vma); 207 bb->vm_ops->close(vma);
208 208
209 sysfs_put_active_two(attr_sd); 209 sysfs_put_active(attr_sd);
210} 210}
211 211
212static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 212static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -219,12 +219,12 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
219 if (!bb->vm_ops || !bb->vm_ops->fault) 219 if (!bb->vm_ops || !bb->vm_ops->fault)
220 return VM_FAULT_SIGBUS; 220 return VM_FAULT_SIGBUS;
221 221
222 if (!sysfs_get_active_two(attr_sd)) 222 if (!sysfs_get_active(attr_sd))
223 return VM_FAULT_SIGBUS; 223 return VM_FAULT_SIGBUS;
224 224
225 ret = bb->vm_ops->fault(vma, vmf); 225 ret = bb->vm_ops->fault(vma, vmf);
226 226
227 sysfs_put_active_two(attr_sd); 227 sysfs_put_active(attr_sd);
228 return ret; 228 return ret;
229} 229}
230 230
@@ -241,12 +241,12 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
241 if (!bb->vm_ops->page_mkwrite) 241 if (!bb->vm_ops->page_mkwrite)
242 return 0; 242 return 0;
243 243
244 if (!sysfs_get_active_two(attr_sd)) 244 if (!sysfs_get_active(attr_sd))
245 return VM_FAULT_SIGBUS; 245 return VM_FAULT_SIGBUS;
246 246
247 ret = bb->vm_ops->page_mkwrite(vma, vmf); 247 ret = bb->vm_ops->page_mkwrite(vma, vmf);
248 248
249 sysfs_put_active_two(attr_sd); 249 sysfs_put_active(attr_sd);
250 return ret; 250 return ret;
251} 251}
252 252
@@ -261,12 +261,12 @@ static int bin_access(struct vm_area_struct *vma, unsigned long addr,
261 if (!bb->vm_ops || !bb->vm_ops->access) 261 if (!bb->vm_ops || !bb->vm_ops->access)
262 return -EINVAL; 262 return -EINVAL;
263 263
264 if (!sysfs_get_active_two(attr_sd)) 264 if (!sysfs_get_active(attr_sd))
265 return -EINVAL; 265 return -EINVAL;
266 266
267 ret = bb->vm_ops->access(vma, addr, buf, len, write); 267 ret = bb->vm_ops->access(vma, addr, buf, len, write);
268 268
269 sysfs_put_active_two(attr_sd); 269 sysfs_put_active(attr_sd);
270 return ret; 270 return ret;
271} 271}
272 272
@@ -281,12 +281,12 @@ static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
281 if (!bb->vm_ops || !bb->vm_ops->set_policy) 281 if (!bb->vm_ops || !bb->vm_ops->set_policy)
282 return 0; 282 return 0;
283 283
284 if (!sysfs_get_active_two(attr_sd)) 284 if (!sysfs_get_active(attr_sd))
285 return -EINVAL; 285 return -EINVAL;
286 286
287 ret = bb->vm_ops->set_policy(vma, new); 287 ret = bb->vm_ops->set_policy(vma, new);
288 288
289 sysfs_put_active_two(attr_sd); 289 sysfs_put_active(attr_sd);
290 return ret; 290 return ret;
291} 291}
292 292
@@ -301,12 +301,12 @@ static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
301 if (!bb->vm_ops || !bb->vm_ops->get_policy) 301 if (!bb->vm_ops || !bb->vm_ops->get_policy)
302 return vma->vm_policy; 302 return vma->vm_policy;
303 303
304 if (!sysfs_get_active_two(attr_sd)) 304 if (!sysfs_get_active(attr_sd))
305 return vma->vm_policy; 305 return vma->vm_policy;
306 306
307 pol = bb->vm_ops->get_policy(vma, addr); 307 pol = bb->vm_ops->get_policy(vma, addr);
308 308
309 sysfs_put_active_two(attr_sd); 309 sysfs_put_active(attr_sd);
310 return pol; 310 return pol;
311} 311}
312 312
@@ -321,12 +321,12 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
321 if (!bb->vm_ops || !bb->vm_ops->migrate) 321 if (!bb->vm_ops || !bb->vm_ops->migrate)
322 return 0; 322 return 0;
323 323
324 if (!sysfs_get_active_two(attr_sd)) 324 if (!sysfs_get_active(attr_sd))
325 return 0; 325 return 0;
326 326
327 ret = bb->vm_ops->migrate(vma, from, to, flags); 327 ret = bb->vm_ops->migrate(vma, from, to, flags);
328 328
329 sysfs_put_active_two(attr_sd); 329 sysfs_put_active(attr_sd);
330 return ret; 330 return ret;
331} 331}
332#endif 332#endif
@@ -356,7 +356,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
356 356
357 /* need attr_sd for attr, its parent for kobj */ 357 /* need attr_sd for attr, its parent for kobj */
358 rc = -ENODEV; 358 rc = -ENODEV;
359 if (!sysfs_get_active_two(attr_sd)) 359 if (!sysfs_get_active(attr_sd))
360 goto out_unlock; 360 goto out_unlock;
361 361
362 rc = -EINVAL; 362 rc = -EINVAL;
@@ -384,7 +384,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
384 bb->vm_ops = vma->vm_ops; 384 bb->vm_ops = vma->vm_ops;
385 vma->vm_ops = &bin_vm_ops; 385 vma->vm_ops = &bin_vm_ops;
386out_put: 386out_put:
387 sysfs_put_active_two(attr_sd); 387 sysfs_put_active(attr_sd);
388out_unlock: 388out_unlock:
389 mutex_unlock(&bb->mutex); 389 mutex_unlock(&bb->mutex);
390 390
@@ -399,7 +399,7 @@ static int open(struct inode * inode, struct file * file)
399 int error; 399 int error;
400 400
401 /* binary file operations requires both @sd and its parent */ 401 /* binary file operations requires both @sd and its parent */
402 if (!sysfs_get_active_two(attr_sd)) 402 if (!sysfs_get_active(attr_sd))
403 return -ENODEV; 403 return -ENODEV;
404 404
405 error = -EACCES; 405 error = -EACCES;
@@ -426,11 +426,11 @@ static int open(struct inode * inode, struct file * file)
426 mutex_unlock(&sysfs_bin_lock); 426 mutex_unlock(&sysfs_bin_lock);
427 427
428 /* open succeeded, put active references */ 428 /* open succeeded, put active references */
429 sysfs_put_active_two(attr_sd); 429 sysfs_put_active(attr_sd);
430 return 0; 430 return 0;
431 431
432 err_out: 432 err_out:
433 sysfs_put_active_two(attr_sd); 433 sysfs_put_active(attr_sd);
434 kfree(bb); 434 kfree(bb);
435 return error; 435 return error;
436} 436}
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index f05f2303a8b8..590717861c7a 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -93,7 +93,7 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
93 * RETURNS: 93 * RETURNS:
94 * Pointer to @sd on success, NULL on failure. 94 * Pointer to @sd on success, NULL on failure.
95 */ 95 */
96static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) 96struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
97{ 97{
98 if (unlikely(!sd)) 98 if (unlikely(!sd))
99 return NULL; 99 return NULL;
@@ -106,8 +106,10 @@ static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
106 return NULL; 106 return NULL;
107 107
108 t = atomic_cmpxchg(&sd->s_active, v, v + 1); 108 t = atomic_cmpxchg(&sd->s_active, v, v + 1);
109 if (likely(t == v)) 109 if (likely(t == v)) {
110 rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_);
110 return sd; 111 return sd;
112 }
111 if (t < 0) 113 if (t < 0)
112 return NULL; 114 return NULL;
113 115
@@ -122,7 +124,7 @@ static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
122 * Put an active reference to @sd. This function is noop if @sd 124 * Put an active reference to @sd. This function is noop if @sd
123 * is NULL. 125 * is NULL.
124 */ 126 */
125static void sysfs_put_active(struct sysfs_dirent *sd) 127void sysfs_put_active(struct sysfs_dirent *sd)
126{ 128{
127 struct completion *cmpl; 129 struct completion *cmpl;
128 int v; 130 int v;
@@ -130,6 +132,7 @@ static void sysfs_put_active(struct sysfs_dirent *sd)
130 if (unlikely(!sd)) 132 if (unlikely(!sd))
131 return; 133 return;
132 134
135 rwsem_release(&sd->dep_map, 1, _RET_IP_);
133 v = atomic_dec_return(&sd->s_active); 136 v = atomic_dec_return(&sd->s_active);
134 if (likely(v != SD_DEACTIVATED_BIAS)) 137 if (likely(v != SD_DEACTIVATED_BIAS))
135 return; 138 return;
@@ -142,45 +145,6 @@ static void sysfs_put_active(struct sysfs_dirent *sd)
142} 145}
143 146
144/** 147/**
145 * sysfs_get_active_two - get active references to sysfs_dirent and parent
146 * @sd: sysfs_dirent of interest
147 *
148 * Get active reference to @sd and its parent. Parent's active
149 * reference is grabbed first. This function is noop if @sd is
150 * NULL.
151 *
152 * RETURNS:
153 * Pointer to @sd on success, NULL on failure.
154 */
155struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd)
156{
157 if (sd) {
158 if (sd->s_parent && unlikely(!sysfs_get_active(sd->s_parent)))
159 return NULL;
160 if (unlikely(!sysfs_get_active(sd))) {
161 sysfs_put_active(sd->s_parent);
162 return NULL;
163 }
164 }
165 return sd;
166}
167
168/**
169 * sysfs_put_active_two - put active references to sysfs_dirent and parent
170 * @sd: sysfs_dirent of interest
171 *
172 * Put active references to @sd and its parent. This function is
173 * noop if @sd is NULL.
174 */
175void sysfs_put_active_two(struct sysfs_dirent *sd)
176{
177 if (sd) {
178 sysfs_put_active(sd);
179 sysfs_put_active(sd->s_parent);
180 }
181}
182
183/**
184 * sysfs_deactivate - deactivate sysfs_dirent 148 * sysfs_deactivate - deactivate sysfs_dirent
185 * @sd: sysfs_dirent to deactivate 149 * @sd: sysfs_dirent to deactivate
186 * 150 *
@@ -192,17 +156,27 @@ static void sysfs_deactivate(struct sysfs_dirent *sd)
192 int v; 156 int v;
193 157
194 BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED)); 158 BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED));
159
160 if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF))
161 return;
162
195 sd->s_sibling = (void *)&wait; 163 sd->s_sibling = (void *)&wait;
196 164
165 rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_);
197 /* atomic_add_return() is a mb(), put_active() will always see 166 /* atomic_add_return() is a mb(), put_active() will always see
198 * the updated sd->s_sibling. 167 * the updated sd->s_sibling.
199 */ 168 */
200 v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); 169 v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active);
201 170
202 if (v != SD_DEACTIVATED_BIAS) 171 if (v != SD_DEACTIVATED_BIAS) {
172 lock_contended(&sd->dep_map, _RET_IP_);
203 wait_for_completion(&wait); 173 wait_for_completion(&wait);
174 }
204 175
205 sd->s_sibling = NULL; 176 sd->s_sibling = NULL;
177
178 lock_acquired(&sd->dep_map, _RET_IP_);
179 rwsem_release(&sd->dep_map, 1, _RET_IP_);
206} 180}
207 181
208static int sysfs_alloc_ino(ino_t *pino) 182static int sysfs_alloc_ino(ino_t *pino)
@@ -671,7 +645,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
671 } 645 }
672 646
673 /* attach dentry and inode */ 647 /* attach dentry and inode */
674 inode = sysfs_get_inode(sd); 648 inode = sysfs_get_inode(dir->i_sb, sd);
675 if (!inode) { 649 if (!inode) {
676 ret = ERR_PTR(-ENOMEM); 650 ret = ERR_PTR(-ENOMEM);
677 goto out_unlock; 651 goto out_unlock;
@@ -827,11 +801,46 @@ static inline unsigned char dt_type(struct sysfs_dirent *sd)
827 return (sd->s_mode >> 12) & 15; 801 return (sd->s_mode >> 12) & 15;
828} 802}
829 803
804static int sysfs_dir_release(struct inode *inode, struct file *filp)
805{
806 sysfs_put(filp->private_data);
807 return 0;
808}
809
810static struct sysfs_dirent *sysfs_dir_pos(struct sysfs_dirent *parent_sd,
811 ino_t ino, struct sysfs_dirent *pos)
812{
813 if (pos) {
814 int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
815 pos->s_parent == parent_sd &&
816 ino == pos->s_ino;
817 sysfs_put(pos);
818 if (valid)
819 return pos;
820 }
821 pos = NULL;
822 if ((ino > 1) && (ino < INT_MAX)) {
823 pos = parent_sd->s_dir.children;
824 while (pos && (ino > pos->s_ino))
825 pos = pos->s_sibling;
826 }
827 return pos;
828}
829
830static struct sysfs_dirent *sysfs_dir_next_pos(struct sysfs_dirent *parent_sd,
831 ino_t ino, struct sysfs_dirent *pos)
832{
833 pos = sysfs_dir_pos(parent_sd, ino, pos);
834 if (pos)
835 pos = pos->s_sibling;
836 return pos;
837}
838
830static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) 839static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
831{ 840{
832 struct dentry *dentry = filp->f_path.dentry; 841 struct dentry *dentry = filp->f_path.dentry;
833 struct sysfs_dirent * parent_sd = dentry->d_fsdata; 842 struct sysfs_dirent * parent_sd = dentry->d_fsdata;
834 struct sysfs_dirent *pos; 843 struct sysfs_dirent *pos = filp->private_data;
835 ino_t ino; 844 ino_t ino;
836 845
837 if (filp->f_pos == 0) { 846 if (filp->f_pos == 0) {
@@ -847,29 +856,31 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
847 if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0) 856 if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0)
848 filp->f_pos++; 857 filp->f_pos++;
849 } 858 }
850 if ((filp->f_pos > 1) && (filp->f_pos < INT_MAX)) { 859 mutex_lock(&sysfs_mutex);
851 mutex_lock(&sysfs_mutex); 860 for (pos = sysfs_dir_pos(parent_sd, filp->f_pos, pos);
852 861 pos;
853 /* Skip the dentries we have already reported */ 862 pos = sysfs_dir_next_pos(parent_sd, filp->f_pos, pos)) {
854 pos = parent_sd->s_dir.children; 863 const char * name;
855 while (pos && (filp->f_pos > pos->s_ino)) 864 unsigned int type;
856 pos = pos->s_sibling; 865 int len, ret;
857 866
858 for ( ; pos; pos = pos->s_sibling) { 867 name = pos->s_name;
859 const char * name; 868 len = strlen(name);
860 int len; 869 ino = pos->s_ino;
861 870 type = dt_type(pos);
862 name = pos->s_name; 871 filp->f_pos = ino;
863 len = strlen(name); 872 filp->private_data = sysfs_get(pos);
864 filp->f_pos = ino = pos->s_ino;
865 873
866 if (filldir(dirent, name, len, filp->f_pos, ino,
867 dt_type(pos)) < 0)
868 break;
869 }
870 if (!pos)
871 filp->f_pos = INT_MAX;
872 mutex_unlock(&sysfs_mutex); 874 mutex_unlock(&sysfs_mutex);
875 ret = filldir(dirent, name, len, filp->f_pos, ino, type);
876 mutex_lock(&sysfs_mutex);
877 if (ret < 0)
878 break;
879 }
880 mutex_unlock(&sysfs_mutex);
881 if ((filp->f_pos > 1) && !pos) { /* EOF */
882 filp->f_pos = INT_MAX;
883 filp->private_data = NULL;
873 } 884 }
874 return 0; 885 return 0;
875} 886}
@@ -878,5 +889,6 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
878const struct file_operations sysfs_dir_operations = { 889const struct file_operations sysfs_dir_operations = {
879 .read = generic_read_dir, 890 .read = generic_read_dir,
880 .readdir = sysfs_readdir, 891 .readdir = sysfs_readdir,
892 .release = sysfs_dir_release,
881 .llseek = generic_file_llseek, 893 .llseek = generic_file_llseek,
882}; 894};
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index dc30d9e31683..e222b2582746 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -53,7 +53,7 @@ struct sysfs_buffer {
53 size_t count; 53 size_t count;
54 loff_t pos; 54 loff_t pos;
55 char * page; 55 char * page;
56 struct sysfs_ops * ops; 56 const struct sysfs_ops * ops;
57 struct mutex mutex; 57 struct mutex mutex;
58 int needs_read_fill; 58 int needs_read_fill;
59 int event; 59 int event;
@@ -75,7 +75,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
75{ 75{
76 struct sysfs_dirent *attr_sd = dentry->d_fsdata; 76 struct sysfs_dirent *attr_sd = dentry->d_fsdata;
77 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 77 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
78 struct sysfs_ops * ops = buffer->ops; 78 const struct sysfs_ops * ops = buffer->ops;
79 int ret = 0; 79 int ret = 0;
80 ssize_t count; 80 ssize_t count;
81 81
@@ -85,13 +85,13 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
85 return -ENOMEM; 85 return -ENOMEM;
86 86
87 /* need attr_sd for attr and ops, its parent for kobj */ 87 /* need attr_sd for attr and ops, its parent for kobj */
88 if (!sysfs_get_active_two(attr_sd)) 88 if (!sysfs_get_active(attr_sd))
89 return -ENODEV; 89 return -ENODEV;
90 90
91 buffer->event = atomic_read(&attr_sd->s_attr.open->event); 91 buffer->event = atomic_read(&attr_sd->s_attr.open->event);
92 count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page); 92 count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page);
93 93
94 sysfs_put_active_two(attr_sd); 94 sysfs_put_active(attr_sd);
95 95
96 /* 96 /*
97 * The code works fine with PAGE_SIZE return but it's likely to 97 * The code works fine with PAGE_SIZE return but it's likely to
@@ -199,16 +199,16 @@ flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t
199{ 199{
200 struct sysfs_dirent *attr_sd = dentry->d_fsdata; 200 struct sysfs_dirent *attr_sd = dentry->d_fsdata;
201 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 201 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
202 struct sysfs_ops * ops = buffer->ops; 202 const struct sysfs_ops * ops = buffer->ops;
203 int rc; 203 int rc;
204 204
205 /* need attr_sd for attr and ops, its parent for kobj */ 205 /* need attr_sd for attr and ops, its parent for kobj */
206 if (!sysfs_get_active_two(attr_sd)) 206 if (!sysfs_get_active(attr_sd))
207 return -ENODEV; 207 return -ENODEV;
208 208
209 rc = ops->store(kobj, attr_sd->s_attr.attr, buffer->page, count); 209 rc = ops->store(kobj, attr_sd->s_attr.attr, buffer->page, count);
210 210
211 sysfs_put_active_two(attr_sd); 211 sysfs_put_active(attr_sd);
212 212
213 return rc; 213 return rc;
214} 214}
@@ -335,7 +335,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
335 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 335 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
336 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 336 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
337 struct sysfs_buffer *buffer; 337 struct sysfs_buffer *buffer;
338 struct sysfs_ops *ops; 338 const struct sysfs_ops *ops;
339 int error = -EACCES; 339 int error = -EACCES;
340 char *p; 340 char *p;
341 341
@@ -344,7 +344,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
344 memmove(last_sysfs_file, p, strlen(p) + 1); 344 memmove(last_sysfs_file, p, strlen(p) + 1);
345 345
346 /* need attr_sd for attr and ops, its parent for kobj */ 346 /* need attr_sd for attr and ops, its parent for kobj */
347 if (!sysfs_get_active_two(attr_sd)) 347 if (!sysfs_get_active(attr_sd))
348 return -ENODEV; 348 return -ENODEV;
349 349
350 /* every kobject with an attribute needs a ktype assigned */ 350 /* every kobject with an attribute needs a ktype assigned */
@@ -393,13 +393,13 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
393 goto err_free; 393 goto err_free;
394 394
395 /* open succeeded, put active references */ 395 /* open succeeded, put active references */
396 sysfs_put_active_two(attr_sd); 396 sysfs_put_active(attr_sd);
397 return 0; 397 return 0;
398 398
399 err_free: 399 err_free:
400 kfree(buffer); 400 kfree(buffer);
401 err_out: 401 err_out:
402 sysfs_put_active_two(attr_sd); 402 sysfs_put_active(attr_sd);
403 return error; 403 return error;
404} 404}
405 405
@@ -437,12 +437,12 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
437 struct sysfs_open_dirent *od = attr_sd->s_attr.open; 437 struct sysfs_open_dirent *od = attr_sd->s_attr.open;
438 438
439 /* need parent for the kobj, grab both */ 439 /* need parent for the kobj, grab both */
440 if (!sysfs_get_active_two(attr_sd)) 440 if (!sysfs_get_active(attr_sd))
441 goto trigger; 441 goto trigger;
442 442
443 poll_wait(filp, &od->poll, wait); 443 poll_wait(filp, &od->poll, wait);
444 444
445 sysfs_put_active_two(attr_sd); 445 sysfs_put_active(attr_sd);
446 446
447 if (buffer->event != atomic_read(&od->event)) 447 if (buffer->event != atomic_read(&od->event))
448 goto trigger; 448 goto trigger;
@@ -509,6 +509,7 @@ int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
509 if (!sd) 509 if (!sd)
510 return -ENOMEM; 510 return -ENOMEM;
511 sd->s_attr.attr = (void *)attr; 511 sd->s_attr.attr = (void *)attr;
512 sysfs_dirent_init_lockdep(sd);
512 513
513 sysfs_addrm_start(&acxt, dir_sd); 514 sysfs_addrm_start(&acxt, dir_sd);
514 rc = sysfs_add_one(&acxt, sd); 515 rc = sysfs_add_one(&acxt, sd);
@@ -542,6 +543,18 @@ int sysfs_create_file(struct kobject * kobj, const struct attribute * attr)
542 543
543} 544}
544 545
546int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr)
547{
548 int err = 0;
549 int i;
550
551 for (i = 0; ptr[i] && !err; i++)
552 err = sysfs_create_file(kobj, ptr[i]);
553 if (err)
554 while (--i >= 0)
555 sysfs_remove_file(kobj, ptr[i]);
556 return err;
557}
545 558
546/** 559/**
547 * sysfs_add_file_to_group - add an attribute file to a pre-existing group. 560 * sysfs_add_file_to_group - add an attribute file to a pre-existing group.
@@ -614,6 +627,12 @@ void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
614 sysfs_hash_and_remove(kobj->sd, attr->name); 627 sysfs_hash_and_remove(kobj->sd, attr->name);
615} 628}
616 629
630void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr)
631{
632 int i;
633 for (i = 0; ptr[i]; i++)
634 sysfs_remove_file(kobj, ptr[i]);
635}
617 636
618/** 637/**
619 * sysfs_remove_file_from_group - remove an attribute file from a group. 638 * sysfs_remove_file_from_group - remove an attribute file from a group.
@@ -732,3 +751,5 @@ EXPORT_SYMBOL_GPL(sysfs_schedule_callback);
732 751
733EXPORT_SYMBOL_GPL(sysfs_create_file); 752EXPORT_SYMBOL_GPL(sysfs_create_file);
734EXPORT_SYMBOL_GPL(sysfs_remove_file); 753EXPORT_SYMBOL_GPL(sysfs_remove_file);
754EXPORT_SYMBOL_GPL(sysfs_remove_files);
755EXPORT_SYMBOL_GPL(sysfs_create_files);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 220b758523ae..a4a0a9419711 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -18,6 +18,7 @@
18#include <linux/capability.h> 18#include <linux/capability.h>
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h>
21#include <linux/xattr.h> 22#include <linux/xattr.h>
22#include <linux/security.h> 23#include <linux/security.h>
23#include "sysfs.h" 24#include "sysfs.h"
@@ -81,24 +82,23 @@ int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr * iattr)
81 if (!sd_attrs) 82 if (!sd_attrs)
82 return -ENOMEM; 83 return -ENOMEM;
83 sd->s_iattr = sd_attrs; 84 sd->s_iattr = sd_attrs;
84 } else { 85 }
85 /* attributes were changed at least once in past */ 86 /* attributes were changed at least once in past */
86 iattrs = &sd_attrs->ia_iattr; 87 iattrs = &sd_attrs->ia_iattr;
87 88
88 if (ia_valid & ATTR_UID) 89 if (ia_valid & ATTR_UID)
89 iattrs->ia_uid = iattr->ia_uid; 90 iattrs->ia_uid = iattr->ia_uid;
90 if (ia_valid & ATTR_GID) 91 if (ia_valid & ATTR_GID)
91 iattrs->ia_gid = iattr->ia_gid; 92 iattrs->ia_gid = iattr->ia_gid;
92 if (ia_valid & ATTR_ATIME) 93 if (ia_valid & ATTR_ATIME)
93 iattrs->ia_atime = iattr->ia_atime; 94 iattrs->ia_atime = iattr->ia_atime;
94 if (ia_valid & ATTR_MTIME) 95 if (ia_valid & ATTR_MTIME)
95 iattrs->ia_mtime = iattr->ia_mtime; 96 iattrs->ia_mtime = iattr->ia_mtime;
96 if (ia_valid & ATTR_CTIME) 97 if (ia_valid & ATTR_CTIME)
97 iattrs->ia_ctime = iattr->ia_ctime; 98 iattrs->ia_ctime = iattr->ia_ctime;
98 if (ia_valid & ATTR_MODE) { 99 if (ia_valid & ATTR_MODE) {
99 umode_t mode = iattr->ia_mode; 100 umode_t mode = iattr->ia_mode;
100 iattrs->ia_mode = sd->s_mode = mode; 101 iattrs->ia_mode = sd->s_mode = mode;
101 }
102 } 102 }
103 return 0; 103 return 0;
104} 104}
@@ -112,20 +112,20 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
112 if (!sd) 112 if (!sd)
113 return -EINVAL; 113 return -EINVAL;
114 114
115 mutex_lock(&sysfs_mutex);
115 error = inode_change_ok(inode, iattr); 116 error = inode_change_ok(inode, iattr);
116 if (error) 117 if (error)
117 return error; 118 goto out;
118 119
119 iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */ 120 iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */
120 121
121 error = inode_setattr(inode, iattr); 122 error = inode_setattr(inode, iattr);
122 if (error) 123 if (error)
123 return error; 124 goto out;
124 125
125 mutex_lock(&sysfs_mutex);
126 error = sysfs_sd_setattr(sd, iattr); 126 error = sysfs_sd_setattr(sd, iattr);
127out:
127 mutex_unlock(&sysfs_mutex); 128 mutex_unlock(&sysfs_mutex);
128
129 return error; 129 return error;
130} 130}
131 131
@@ -284,6 +284,7 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
284 284
285/** 285/**
286 * sysfs_get_inode - get inode for sysfs_dirent 286 * sysfs_get_inode - get inode for sysfs_dirent
287 * @sb: super block
287 * @sd: sysfs_dirent to allocate inode for 288 * @sd: sysfs_dirent to allocate inode for
288 * 289 *
289 * Get inode for @sd. If such inode doesn't exist, a new inode 290 * Get inode for @sd. If such inode doesn't exist, a new inode
@@ -296,11 +297,11 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
296 * RETURNS: 297 * RETURNS:
297 * Pointer to allocated inode on success, NULL on failure. 298 * Pointer to allocated inode on success, NULL on failure.
298 */ 299 */
299struct inode * sysfs_get_inode(struct sysfs_dirent *sd) 300struct inode * sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd)
300{ 301{
301 struct inode *inode; 302 struct inode *inode;
302 303
303 inode = iget_locked(sysfs_sb, sd->s_ino); 304 inode = iget_locked(sb, sd->s_ino);
304 if (inode && (inode->i_state & I_NEW)) 305 if (inode && (inode->i_state & I_NEW))
305 sysfs_init_inode(sd, inode); 306 sysfs_init_inode(sd, inode);
306 307
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 49749955ccaf..776137828dca 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -18,12 +18,12 @@
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/magic.h> 20#include <linux/magic.h>
21#include <linux/slab.h>
21 22
22#include "sysfs.h" 23#include "sysfs.h"
23 24
24 25
25static struct vfsmount *sysfs_mount; 26static struct vfsmount *sysfs_mount;
26struct super_block * sysfs_sb = NULL;
27struct kmem_cache *sysfs_dir_cachep; 27struct kmem_cache *sysfs_dir_cachep;
28 28
29static const struct super_operations sysfs_ops = { 29static const struct super_operations sysfs_ops = {
@@ -50,11 +50,10 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
50 sb->s_magic = SYSFS_MAGIC; 50 sb->s_magic = SYSFS_MAGIC;
51 sb->s_op = &sysfs_ops; 51 sb->s_op = &sysfs_ops;
52 sb->s_time_gran = 1; 52 sb->s_time_gran = 1;
53 sysfs_sb = sb;
54 53
55 /* get root inode, initialize and unlock it */ 54 /* get root inode, initialize and unlock it */
56 mutex_lock(&sysfs_mutex); 55 mutex_lock(&sysfs_mutex);
57 inode = sysfs_get_inode(&sysfs_root); 56 inode = sysfs_get_inode(sb, &sysfs_root);
58 mutex_unlock(&sysfs_mutex); 57 mutex_unlock(&sysfs_mutex);
59 if (!inode) { 58 if (!inode) {
60 pr_debug("sysfs: could not get root inode\n"); 59 pr_debug("sysfs: could not get root inode\n");
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index c5eff49fa41b..b93ec51fa7ac 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -11,6 +11,7 @@
11 */ 11 */
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/gfp.h>
14#include <linux/mount.h> 15#include <linux/mount.h>
15#include <linux/module.h> 16#include <linux/module.h>
16#include <linux/kobject.h> 17#include <linux/kobject.h>
@@ -123,6 +124,44 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
123 sysfs_hash_and_remove(parent_sd, name); 124 sysfs_hash_and_remove(parent_sd, name);
124} 125}
125 126
127/**
128 * sysfs_rename_link - rename symlink in object's directory.
129 * @kobj: object we're acting for.
130 * @targ: object we're pointing to.
131 * @old: previous name of the symlink.
132 * @new: new name of the symlink.
133 *
134 * A helper function for the common rename symlink idiom.
135 */
136int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
137 const char *old, const char *new)
138{
139 struct sysfs_dirent *parent_sd, *sd = NULL;
140 int result;
141
142 if (!kobj)
143 parent_sd = &sysfs_root;
144 else
145 parent_sd = kobj->sd;
146
147 result = -ENOENT;
148 sd = sysfs_get_dirent(parent_sd, old);
149 if (!sd)
150 goto out;
151
152 result = -EINVAL;
153 if (sysfs_type(sd) != SYSFS_KOBJ_LINK)
154 goto out;
155 if (sd->s_symlink.target_sd->s_dir.kobj != targ)
156 goto out;
157
158 result = sysfs_rename(sd, parent_sd, new);
159
160out:
161 sysfs_put(sd);
162 return result;
163}
164
126static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, 165static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
127 struct sysfs_dirent *target_sd, char *path) 166 struct sysfs_dirent *target_sd, char *path)
128{ 167{
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index ca52e7b9d8f8..30f5a44fb5d3 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -8,6 +8,7 @@
8 * This file is released under the GPLv2. 8 * This file is released under the GPLv2.
9 */ 9 */
10 10
11#include <linux/lockdep.h>
11#include <linux/fs.h> 12#include <linux/fs.h>
12 13
13struct sysfs_open_dirent; 14struct sysfs_open_dirent;
@@ -50,6 +51,9 @@ struct sysfs_inode_attrs {
50struct sysfs_dirent { 51struct sysfs_dirent {
51 atomic_t s_count; 52 atomic_t s_count;
52 atomic_t s_active; 53 atomic_t s_active;
54#ifdef CONFIG_DEBUG_LOCK_ALLOC
55 struct lockdep_map dep_map;
56#endif
53 struct sysfs_dirent *s_parent; 57 struct sysfs_dirent *s_parent;
54 struct sysfs_dirent *s_sibling; 58 struct sysfs_dirent *s_sibling;
55 const char *s_name; 59 const char *s_name;
@@ -62,8 +66,8 @@ struct sysfs_dirent {
62 }; 66 };
63 67
64 unsigned int s_flags; 68 unsigned int s_flags;
69 unsigned short s_mode;
65 ino_t s_ino; 70 ino_t s_ino;
66 umode_t s_mode;
67 struct sysfs_inode_attrs *s_iattr; 71 struct sysfs_inode_attrs *s_iattr;
68}; 72};
69 73
@@ -75,6 +79,7 @@ struct sysfs_dirent {
75#define SYSFS_KOBJ_BIN_ATTR 0x0004 79#define SYSFS_KOBJ_BIN_ATTR 0x0004
76#define SYSFS_KOBJ_LINK 0x0008 80#define SYSFS_KOBJ_LINK 0x0008
77#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) 81#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK)
82#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
78 83
79#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK 84#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK
80#define SYSFS_FLAG_REMOVED 0x0200 85#define SYSFS_FLAG_REMOVED 0x0200
@@ -84,6 +89,20 @@ static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
84 return sd->s_flags & SYSFS_TYPE_MASK; 89 return sd->s_flags & SYSFS_TYPE_MASK;
85} 90}
86 91
92#ifdef CONFIG_DEBUG_LOCK_ALLOC
93#define sysfs_dirent_init_lockdep(sd) \
94do { \
95 struct attribute *attr = sd->s_attr.attr; \
96 struct lock_class_key *key = attr->key; \
97 if (!key) \
98 key = &attr->skey; \
99 \
100 lockdep_init_map(&sd->dep_map, "s_active", key, 0); \
101} while(0)
102#else
103#define sysfs_dirent_init_lockdep(sd) do {} while(0)
104#endif
105
87/* 106/*
88 * Context structure to be used while adding/removing nodes. 107 * Context structure to be used while adding/removing nodes.
89 */ 108 */
@@ -96,7 +115,6 @@ struct sysfs_addrm_cxt {
96 * mount.c 115 * mount.c
97 */ 116 */
98extern struct sysfs_dirent sysfs_root; 117extern struct sysfs_dirent sysfs_root;
99extern struct super_block *sysfs_sb;
100extern struct kmem_cache *sysfs_dir_cachep; 118extern struct kmem_cache *sysfs_dir_cachep;
101 119
102/* 120/*
@@ -109,8 +127,8 @@ extern const struct file_operations sysfs_dir_operations;
109extern const struct inode_operations sysfs_dir_inode_operations; 127extern const struct inode_operations sysfs_dir_inode_operations;
110 128
111struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd); 129struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd);
112struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd); 130struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
113void sysfs_put_active_two(struct sysfs_dirent *sd); 131void sysfs_put_active(struct sysfs_dirent *sd);
114void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, 132void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
115 struct sysfs_dirent *parent_sd); 133 struct sysfs_dirent *parent_sd);
116int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); 134int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
@@ -153,7 +171,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
153/* 171/*
154 * inode.c 172 * inode.c
155 */ 173 */
156struct inode *sysfs_get_inode(struct sysfs_dirent *sd); 174struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
157void sysfs_delete_inode(struct inode *inode); 175void sysfs_delete_inode(struct inode *inode);
158int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); 176int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
159int sysfs_permission(struct inode *inode, int mask); 177int sysfs_permission(struct inode *inode, int mask);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 9824743832a7..4573734d723d 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -26,6 +26,7 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/vfs.h> 28#include <linux/vfs.h>
29#include <linux/writeback.h>
29#include <linux/namei.h> 30#include <linux/namei.h>
30#include <asm/byteorder.h> 31#include <asm/byteorder.h>
31#include "sysv.h" 32#include "sysv.h"
@@ -246,7 +247,7 @@ bad_inode:
246 return ERR_PTR(-EIO); 247 return ERR_PTR(-EIO);
247} 248}
248 249
249int sysv_write_inode(struct inode *inode, int wait) 250static int __sysv_write_inode(struct inode *inode, int wait)
250{ 251{
251 struct super_block * sb = inode->i_sb; 252 struct super_block * sb = inode->i_sb;
252 struct sysv_sb_info * sbi = SYSV_SB(sb); 253 struct sysv_sb_info * sbi = SYSV_SB(sb);
@@ -296,9 +297,14 @@ int sysv_write_inode(struct inode *inode, int wait)
296 return 0; 297 return 0;
297} 298}
298 299
300int sysv_write_inode(struct inode *inode, struct writeback_control *wbc)
301{
302 return __sysv_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
303}
304
299int sysv_sync_inode(struct inode *inode) 305int sysv_sync_inode(struct inode *inode)
300{ 306{
301 return sysv_write_inode(inode, 1); 307 return __sysv_write_inode(inode, 1);
302} 308}
303 309
304static void sysv_delete_inode(struct inode *inode) 310static void sysv_delete_inode(struct inode *inode)
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 53786eb5cf60..94cb9b4d76c2 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -142,7 +142,7 @@ extern int __sysv_write_begin(struct file *file, struct address_space *mapping,
142 142
143/* inode.c */ 143/* inode.c */
144extern struct inode *sysv_iget(struct super_block *, unsigned int); 144extern struct inode *sysv_iget(struct super_block *, unsigned int);
145extern int sysv_write_inode(struct inode *, int); 145extern int sysv_write_inode(struct inode *, struct writeback_control *wbc);
146extern int sysv_sync_inode(struct inode *); 146extern int sysv_sync_inode(struct inode *);
147extern void sysv_set_inode(struct inode *, dev_t); 147extern void sysv_set_inode(struct inode *, dev_t);
148extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *); 148extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *);
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 1bfc95ad5f71..98158de91d24 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -14,6 +14,7 @@
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/slab.h>
17#include <linux/list.h> 18#include <linux/list.h>
18#include <linux/spinlock.h> 19#include <linux/spinlock.h>
19#include <linux/time.h> 20#include <linux/time.h>
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 4775af401167..37fa7ed062d8 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -45,6 +45,7 @@
45 45
46#include <linux/freezer.h> 46#include <linux/freezer.h>
47#include <linux/kthread.h> 47#include <linux/kthread.h>
48#include <linux/slab.h>
48#include "ubifs.h" 49#include "ubifs.h"
49 50
50/** 51/**
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 90492327b383..c2a68baa782f 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -34,6 +34,7 @@
34#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
35#include <linux/debugfs.h> 35#include <linux/debugfs.h>
36#include <linux/math64.h> 36#include <linux/math64.h>
37#include <linux/slab.h>
37 38
38#ifdef CONFIG_UBIFS_FS_DEBUG 39#ifdef CONFIG_UBIFS_FS_DEBUG
39 40
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 552fb0111fff..401e503d44a1 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1120,7 +1120,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
1120 if (release) 1120 if (release)
1121 ubifs_release_budget(c, &ino_req); 1121 ubifs_release_budget(c, &ino_req);
1122 if (IS_SYNC(old_inode)) 1122 if (IS_SYNC(old_inode))
1123 err = old_inode->i_sb->s_op->write_inode(old_inode, 1); 1123 err = old_inode->i_sb->s_op->write_inode(old_inode, NULL);
1124 return err; 1124 return err;
1125 1125
1126out_cancel: 1126out_cancel:
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 16a6444330ec..5692cf72b807 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -52,6 +52,7 @@
52#include "ubifs.h" 52#include "ubifs.h"
53#include <linux/mount.h> 53#include <linux/mount.h>
54#include <linux/namei.h> 54#include <linux/namei.h>
55#include <linux/slab.h>
55 56
56static int read_block(struct inode *inode, void *addr, unsigned int block, 57static int read_block(struct inode *inode, void *addr, unsigned int block,
57 struct ubifs_data_node *dn) 58 struct ubifs_data_node *dn)
@@ -1011,7 +1012,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
1011 /* Is the page fully inside @i_size? */ 1012 /* Is the page fully inside @i_size? */
1012 if (page->index < end_index) { 1013 if (page->index < end_index) {
1013 if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) { 1014 if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) {
1014 err = inode->i_sb->s_op->write_inode(inode, 1); 1015 err = inode->i_sb->s_op->write_inode(inode, NULL);
1015 if (err) 1016 if (err)
1016 goto out_unlock; 1017 goto out_unlock;
1017 /* 1018 /*
@@ -1039,7 +1040,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
1039 kunmap_atomic(kaddr, KM_USER0); 1040 kunmap_atomic(kaddr, KM_USER0);
1040 1041
1041 if (i_size > synced_i_size) { 1042 if (i_size > synced_i_size) {
1042 err = inode->i_sb->s_op->write_inode(inode, 1); 1043 err = inode->i_sb->s_op->write_inode(inode, NULL);
1043 if (err) 1044 if (err)
1044 goto out_unlock; 1045 goto out_unlock;
1045 } 1046 }
@@ -1242,7 +1243,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
1242 if (release) 1243 if (release)
1243 ubifs_release_budget(c, &req); 1244 ubifs_release_budget(c, &req);
1244 if (IS_SYNC(inode)) 1245 if (IS_SYNC(inode))
1245 err = inode->i_sb->s_op->write_inode(inode, 1); 1246 err = inode->i_sb->s_op->write_inode(inode, NULL);
1246 return err; 1247 return err;
1247 1248
1248out: 1249out:
@@ -1316,7 +1317,7 @@ int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync)
1316 * the inode unless this is a 'datasync()' call. 1317 * the inode unless this is a 'datasync()' call.
1317 */ 1318 */
1318 if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { 1319 if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) {
1319 err = inode->i_sb->s_op->write_inode(inode, 1); 1320 err = inode->i_sb->s_op->write_inode(inode, NULL);
1320 if (err) 1321 if (err)
1321 return err; 1322 return err;
1322 } 1323 }
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 618c2701d3a7..918d1582ca05 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -53,7 +53,9 @@
53 * good, and GC takes extra care when moving them. 53 * good, and GC takes extra care when moving them.
54 */ 54 */
55 55
56#include <linux/slab.h>
56#include <linux/pagemap.h> 57#include <linux/pagemap.h>
58#include <linux/list_sort.h>
57#include "ubifs.h" 59#include "ubifs.h"
58 60
59/* 61/*
@@ -108,101 +110,6 @@ static int switch_gc_head(struct ubifs_info *c)
108} 110}
109 111
110/** 112/**
111 * list_sort - sort a list.
112 * @priv: private data, passed to @cmp
113 * @head: the list to sort
114 * @cmp: the elements comparison function
115 *
116 * This function has been implemented by Mark J Roberts <mjr@znex.org>. It
117 * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted
118 * in ascending order.
119 *
120 * The comparison function @cmp is supposed to return a negative value if @a is
121 * than @b, and a positive value if @a is greater than @b. If @a and @b are
122 * equivalent, then it does not matter what this function returns.
123 */
124static void list_sort(void *priv, struct list_head *head,
125 int (*cmp)(void *priv, struct list_head *a,
126 struct list_head *b))
127{
128 struct list_head *p, *q, *e, *list, *tail, *oldhead;
129 int insize, nmerges, psize, qsize, i;
130
131 if (list_empty(head))
132 return;
133
134 list = head->next;
135 list_del(head);
136 insize = 1;
137 for (;;) {
138 p = oldhead = list;
139 list = tail = NULL;
140 nmerges = 0;
141
142 while (p) {
143 nmerges++;
144 q = p;
145 psize = 0;
146 for (i = 0; i < insize; i++) {
147 psize++;
148 q = q->next == oldhead ? NULL : q->next;
149 if (!q)
150 break;
151 }
152
153 qsize = insize;
154 while (psize > 0 || (qsize > 0 && q)) {
155 if (!psize) {
156 e = q;
157 q = q->next;
158 qsize--;
159 if (q == oldhead)
160 q = NULL;
161 } else if (!qsize || !q) {
162 e = p;
163 p = p->next;
164 psize--;
165 if (p == oldhead)
166 p = NULL;
167 } else if (cmp(priv, p, q) <= 0) {
168 e = p;
169 p = p->next;
170 psize--;
171 if (p == oldhead)
172 p = NULL;
173 } else {
174 e = q;
175 q = q->next;
176 qsize--;
177 if (q == oldhead)
178 q = NULL;
179 }
180 if (tail)
181 tail->next = e;
182 else
183 list = e;
184 e->prev = tail;
185 tail = e;
186 }
187 p = q;
188 }
189
190 tail->next = list;
191 list->prev = tail;
192
193 if (nmerges <= 1)
194 break;
195
196 insize *= 2;
197 }
198
199 head->next = list;
200 head->prev = list->prev;
201 list->prev->next = head;
202 list->prev = head;
203}
204
205/**
206 * data_nodes_cmp - compare 2 data nodes. 113 * data_nodes_cmp - compare 2 data nodes.
207 * @priv: UBIFS file-system description object 114 * @priv: UBIFS file-system description object
208 * @a: first data node 115 * @a: first data node
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index e589fedaf1ef..77d5cf4a7547 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -51,6 +51,7 @@
51 */ 51 */
52 52
53#include <linux/crc32.h> 53#include <linux/crc32.h>
54#include <linux/slab.h>
54#include "ubifs.h" 55#include "ubifs.h"
55 56
56/** 57/**
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index b2792e84d245..ad7f67b827ea 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -46,6 +46,7 @@
46#include "ubifs.h" 46#include "ubifs.h"
47#include <linux/crc16.h> 47#include <linux/crc16.h>
48#include <linux/math64.h> 48#include <linux/math64.h>
49#include <linux/slab.h>
49 50
50/** 51/**
51 * do_calc_lpt_geom - calculate sizes for the LPT area. 52 * do_calc_lpt_geom - calculate sizes for the LPT area.
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 8cbfb8248025..13cb7a4237bf 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -26,6 +26,7 @@
26 */ 26 */
27 27
28#include <linux/crc16.h> 28#include <linux/crc16.h>
29#include <linux/slab.h>
29#include "ubifs.h" 30#include "ubifs.h"
30 31
31/** 32/**
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 868a55ee080f..109c6ea03bb5 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <linux/crc32.h> 33#include <linux/crc32.h>
34#include <linux/slab.h>
34#include "ubifs.h" 35#include "ubifs.h"
35 36
36/** 37/**
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 57085e43320f..96cb62c8a9dd 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -27,6 +27,7 @@
27 */ 27 */
28 28
29#include "ubifs.h" 29#include "ubifs.h"
30#include <linux/slab.h>
30#include <linux/random.h> 31#include <linux/random.h>
31#include <linux/math64.h> 32#include <linux/math64.h>
32 33
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 43f9d19a6f33..4d2f2157dd3f 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -283,7 +283,7 @@ static void ubifs_destroy_inode(struct inode *inode)
283/* 283/*
284 * Note, Linux write-back code calls this without 'i_mutex'. 284 * Note, Linux write-back code calls this without 'i_mutex'.
285 */ 285 */
286static int ubifs_write_inode(struct inode *inode, int wait) 286static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc)
287{ 287{
288 int err = 0; 288 int err = 0;
289 struct ubifs_info *c = inode->i_sb->s_fs_info; 289 struct ubifs_info *c = inode->i_sb->s_fs_info;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e5b1a7d00fa0..2194915220e5 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <linux/crc32.h> 33#include <linux/crc32.h>
34#include <linux/slab.h>
34#include "ubifs.h" 35#include "ubifs.h"
35 36
36/* 37/*
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index b2d976366a46..bd2542dad014 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -28,6 +28,7 @@
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/err.h> 29#include <linux/err.h>
30#include <linux/sched.h> 30#include <linux/sched.h>
31#include <linux/slab.h>
31#include <linux/vmalloc.h> 32#include <linux/vmalloc.h>
32#include <linux/spinlock.h> 33#include <linux/spinlock.h>
33#include <linux/mutex.h> 34#include <linux/mutex.h>
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 195830f47569..c74400f88fe0 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -56,6 +56,7 @@
56 */ 56 */
57 57
58#include "ubifs.h" 58#include "ubifs.h"
59#include <linux/slab.h>
59#include <linux/xattr.h> 60#include <linux/xattr.h>
60#include <linux/posix_acl_xattr.h> 61#include <linux/posix_acl_xattr.h>
61 62
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 82372e332f08..9a9378b4eb5a 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -31,55 +31,8 @@
31#define udf_clear_bit(nr, addr) ext2_clear_bit(nr, addr) 31#define udf_clear_bit(nr, addr) ext2_clear_bit(nr, addr)
32#define udf_set_bit(nr, addr) ext2_set_bit(nr, addr) 32#define udf_set_bit(nr, addr) ext2_set_bit(nr, addr)
33#define udf_test_bit(nr, addr) ext2_test_bit(nr, addr) 33#define udf_test_bit(nr, addr) ext2_test_bit(nr, addr)
34#define udf_find_first_one_bit(addr, size) find_first_one_bit(addr, size)
35#define udf_find_next_one_bit(addr, size, offset) \ 34#define udf_find_next_one_bit(addr, size, offset) \
36 find_next_one_bit(addr, size, offset) 35 ext2_find_next_bit(addr, size, offset)
37
38#define leBPL_to_cpup(x) leNUM_to_cpup(BITS_PER_LONG, x)
39#define leNUM_to_cpup(x, y) xleNUM_to_cpup(x, y)
40#define xleNUM_to_cpup(x, y) (le ## x ## _to_cpup(y))
41#define uintBPL_t uint(BITS_PER_LONG)
42#define uint(x) xuint(x)
43#define xuint(x) __le ## x
44
45static inline int find_next_one_bit(void *addr, int size, int offset)
46{
47 uintBPL_t *p = ((uintBPL_t *) addr) + (offset / BITS_PER_LONG);
48 int result = offset & ~(BITS_PER_LONG - 1);
49 unsigned long tmp;
50
51 if (offset >= size)
52 return size;
53 size -= result;
54 offset &= (BITS_PER_LONG - 1);
55 if (offset) {
56 tmp = leBPL_to_cpup(p++);
57 tmp &= ~0UL << offset;
58 if (size < BITS_PER_LONG)
59 goto found_first;
60 if (tmp)
61 goto found_middle;
62 size -= BITS_PER_LONG;
63 result += BITS_PER_LONG;
64 }
65 while (size & ~(BITS_PER_LONG - 1)) {
66 tmp = leBPL_to_cpup(p++);
67 if (tmp)
68 goto found_middle;
69 result += BITS_PER_LONG;
70 size -= BITS_PER_LONG;
71 }
72 if (!size)
73 return result;
74 tmp = leBPL_to_cpup(p);
75found_first:
76 tmp &= ~0UL >> (BITS_PER_LONG - size);
77found_middle:
78 return result + ffz(~tmp);
79}
80
81#define find_first_one_bit(addr, size)\
82 find_next_one_bit((addr), (size), 0)
83 36
84static int read_block_bitmap(struct super_block *sb, 37static int read_block_bitmap(struct super_block *sb,
85 struct udf_bitmap *bitmap, unsigned int block, 38 struct udf_bitmap *bitmap, unsigned int block,
@@ -172,9 +125,8 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
172 125
173 mutex_lock(&sbi->s_alloc_mutex); 126 mutex_lock(&sbi->s_alloc_mutex);
174 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; 127 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
175 if (bloc->logicalBlockNum < 0 || 128 if (bloc->logicalBlockNum + count < count ||
176 (bloc->logicalBlockNum + count) > 129 (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
177 partmap->s_partition_len) {
178 udf_debug("%d < %d || %d + %d > %d\n", 130 udf_debug("%d < %d || %d + %d > %d\n",
179 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, 131 bloc->logicalBlockNum, 0, bloc->logicalBlockNum,
180 count, partmap->s_partition_len); 132 count, partmap->s_partition_len);
@@ -208,7 +160,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
208 ((char *)bh->b_data)[(bit + i) >> 3]); 160 ((char *)bh->b_data)[(bit + i) >> 3]);
209 } else { 161 } else {
210 if (inode) 162 if (inode)
211 vfs_dq_free_block(inode, 1); 163 dquot_free_block(inode, 1);
212 udf_add_free_space(sb, sbi->s_partition, 1); 164 udf_add_free_space(sb, sbi->s_partition, 1);
213 } 165 }
214 } 166 }
@@ -260,11 +212,11 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
260 while (bit < (sb->s_blocksize << 3) && block_count > 0) { 212 while (bit < (sb->s_blocksize << 3) && block_count > 0) {
261 if (!udf_test_bit(bit, bh->b_data)) 213 if (!udf_test_bit(bit, bh->b_data))
262 goto out; 214 goto out;
263 else if (vfs_dq_prealloc_block(inode, 1)) 215 else if (dquot_prealloc_block(inode, 1))
264 goto out; 216 goto out;
265 else if (!udf_clear_bit(bit, bh->b_data)) { 217 else if (!udf_clear_bit(bit, bh->b_data)) {
266 udf_debug("bit already cleared for block %d\n", bit); 218 udf_debug("bit already cleared for block %d\n", bit);
267 vfs_dq_free_block(inode, 1); 219 dquot_free_block(inode, 1);
268 goto out; 220 goto out;
269 } 221 }
270 block_count--; 222 block_count--;
@@ -390,10 +342,14 @@ got_block:
390 /* 342 /*
391 * Check quota for allocation of this block. 343 * Check quota for allocation of this block.
392 */ 344 */
393 if (inode && vfs_dq_alloc_block(inode, 1)) { 345 if (inode) {
394 mutex_unlock(&sbi->s_alloc_mutex); 346 int ret = dquot_alloc_block(inode, 1);
395 *err = -EDQUOT; 347
396 return 0; 348 if (ret) {
349 mutex_unlock(&sbi->s_alloc_mutex);
350 *err = ret;
351 return 0;
352 }
397 } 353 }
398 354
399 newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) - 355 newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) -
@@ -436,9 +392,8 @@ static void udf_table_free_blocks(struct super_block *sb,
436 392
437 mutex_lock(&sbi->s_alloc_mutex); 393 mutex_lock(&sbi->s_alloc_mutex);
438 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; 394 partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
439 if (bloc->logicalBlockNum < 0 || 395 if (bloc->logicalBlockNum + count < count ||
440 (bloc->logicalBlockNum + count) > 396 (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
441 partmap->s_partition_len) {
442 udf_debug("%d < %d || %d + %d > %d\n", 397 udf_debug("%d < %d || %d + %d > %d\n",
443 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count, 398 bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count,
444 partmap->s_partition_len); 399 partmap->s_partition_len);
@@ -449,7 +404,7 @@ static void udf_table_free_blocks(struct super_block *sb,
449 /* We do this up front - There are some error conditions that 404 /* We do this up front - There are some error conditions that
450 could occure, but.. oh well */ 405 could occure, but.. oh well */
451 if (inode) 406 if (inode)
452 vfs_dq_free_block(inode, count); 407 dquot_free_block(inode, count);
453 udf_add_free_space(sb, sbi->s_partition, count); 408 udf_add_free_space(sb, sbi->s_partition, count);
454 409
455 start = bloc->logicalBlockNum + offset; 410 start = bloc->logicalBlockNum + offset;
@@ -547,7 +502,7 @@ static void udf_table_free_blocks(struct super_block *sb,
547 } 502 }
548 503
549 if (epos.offset + (2 * adsize) > sb->s_blocksize) { 504 if (epos.offset + (2 * adsize) > sb->s_blocksize) {
550 char *sptr, *dptr; 505 unsigned char *sptr, *dptr;
551 int loffset; 506 int loffset;
552 507
553 brelse(oepos.bh); 508 brelse(oepos.bh);
@@ -694,7 +649,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
694 epos.offset -= adsize; 649 epos.offset -= adsize;
695 650
696 alloc_count = (elen >> sb->s_blocksize_bits); 651 alloc_count = (elen >> sb->s_blocksize_bits);
697 if (inode && vfs_dq_prealloc_block(inode, 652 if (inode && dquot_prealloc_block(inode,
698 alloc_count > block_count ? block_count : alloc_count)) 653 alloc_count > block_count ? block_count : alloc_count))
699 alloc_count = 0; 654 alloc_count = 0;
700 else if (alloc_count > block_count) { 655 else if (alloc_count > block_count) {
@@ -797,12 +752,13 @@ static int udf_table_new_block(struct super_block *sb,
797 newblock = goal_eloc.logicalBlockNum; 752 newblock = goal_eloc.logicalBlockNum;
798 goal_eloc.logicalBlockNum++; 753 goal_eloc.logicalBlockNum++;
799 goal_elen -= sb->s_blocksize; 754 goal_elen -= sb->s_blocksize;
800 755 if (inode) {
801 if (inode && vfs_dq_alloc_block(inode, 1)) { 756 *err = dquot_alloc_block(inode, 1);
802 brelse(goal_epos.bh); 757 if (*err) {
803 mutex_unlock(&sbi->s_alloc_mutex); 758 brelse(goal_epos.bh);
804 *err = -EDQUOT; 759 mutex_unlock(&sbi->s_alloc_mutex);
805 return 0; 760 return 0;
761 }
806 } 762 }
807 763
808 if (goal_elen) 764 if (goal_elen)
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 61d9a76a3a69..f0f2a436251e 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -45,8 +45,8 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
45 int block, iblock; 45 int block, iblock;
46 loff_t nf_pos = (filp->f_pos - 1) << 2; 46 loff_t nf_pos = (filp->f_pos - 1) << 2;
47 int flen; 47 int flen;
48 char *fname = NULL; 48 unsigned char *fname = NULL;
49 char *nameptr; 49 unsigned char *nameptr;
50 uint16_t liu; 50 uint16_t liu;
51 uint8_t lfi; 51 uint8_t lfi;
52 loff_t size = udf_ext0_offset(dir) + dir->i_size; 52 loff_t size = udf_ext0_offset(dir) + dir->i_size;
diff --git a/fs/udf/file.c b/fs/udf/file.c
index f311d509b6a3..4b6a46ccbf46 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -34,6 +34,7 @@
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/smp_lock.h> 35#include <linux/smp_lock.h>
36#include <linux/pagemap.h> 36#include <linux/pagemap.h>
37#include <linux/quotaops.h>
37#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
38#include <linux/aio.h> 39#include <linux/aio.h>
39 40
@@ -207,7 +208,7 @@ const struct file_operations udf_file_operations = {
207 .read = do_sync_read, 208 .read = do_sync_read,
208 .aio_read = generic_file_aio_read, 209 .aio_read = generic_file_aio_read,
209 .ioctl = udf_ioctl, 210 .ioctl = udf_ioctl,
210 .open = generic_file_open, 211 .open = dquot_file_open,
211 .mmap = generic_file_mmap, 212 .mmap = generic_file_mmap,
212 .write = do_sync_write, 213 .write = do_sync_write,
213 .aio_write = udf_file_aio_write, 214 .aio_write = udf_file_aio_write,
@@ -217,6 +218,29 @@ const struct file_operations udf_file_operations = {
217 .llseek = generic_file_llseek, 218 .llseek = generic_file_llseek,
218}; 219};
219 220
221int udf_setattr(struct dentry *dentry, struct iattr *iattr)
222{
223 struct inode *inode = dentry->d_inode;
224 int error;
225
226 error = inode_change_ok(inode, iattr);
227 if (error)
228 return error;
229
230 if (iattr->ia_valid & ATTR_SIZE)
231 dquot_initialize(inode);
232
233 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
234 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
235 error = dquot_transfer(inode, iattr);
236 if (error)
237 return error;
238 }
239
240 return inode_setattr(inode, iattr);
241}
242
220const struct inode_operations udf_file_inode_operations = { 243const struct inode_operations udf_file_inode_operations = {
221 .truncate = udf_truncate, 244 .truncate = udf_truncate,
245 .setattr = udf_setattr,
222}; 246};
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index c10fa39f97e2..fb68c9cd0c3e 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -36,8 +36,8 @@ void udf_free_inode(struct inode *inode)
36 * Note: we must free any quota before locking the superblock, 36 * Note: we must free any quota before locking the superblock,
37 * as writing the quota to disk may need the lock as well. 37 * as writing the quota to disk may need the lock as well.
38 */ 38 */
39 vfs_dq_free_inode(inode); 39 dquot_free_inode(inode);
40 vfs_dq_drop(inode); 40 dquot_drop(inode);
41 41
42 clear_inode(inode); 42 clear_inode(inode);
43 43
@@ -61,7 +61,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
61 struct super_block *sb = dir->i_sb; 61 struct super_block *sb = dir->i_sb;
62 struct udf_sb_info *sbi = UDF_SB(sb); 62 struct udf_sb_info *sbi = UDF_SB(sb);
63 struct inode *inode; 63 struct inode *inode;
64 int block; 64 int block, ret;
65 uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; 65 uint32_t start = UDF_I(dir)->i_location.logicalBlockNum;
66 struct udf_inode_info *iinfo; 66 struct udf_inode_info *iinfo;
67 struct udf_inode_info *dinfo = UDF_I(dir); 67 struct udf_inode_info *dinfo = UDF_I(dir);
@@ -153,12 +153,14 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
153 insert_inode_hash(inode); 153 insert_inode_hash(inode);
154 mark_inode_dirty(inode); 154 mark_inode_dirty(inode);
155 155
156 if (vfs_dq_alloc_inode(inode)) { 156 dquot_initialize(inode);
157 vfs_dq_drop(inode); 157 ret = dquot_alloc_inode(inode);
158 if (ret) {
159 dquot_drop(inode);
158 inode->i_flags |= S_NOQUOTA; 160 inode->i_flags |= S_NOQUOTA;
159 inode->i_nlink = 0; 161 inode->i_nlink = 0;
160 iput(inode); 162 iput(inode);
161 *err = -EDQUOT; 163 *err = ret;
162 return NULL; 164 return NULL;
163 } 165 }
164 166
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index f90231eb2916..8a3fbd177cab 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -36,6 +36,7 @@
36#include <linux/pagemap.h> 36#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/quotaops.h>
39#include <linux/slab.h> 40#include <linux/slab.h>
40#include <linux/crc-itu-t.h> 41#include <linux/crc-itu-t.h>
41 42
@@ -70,6 +71,9 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
70 71
71void udf_delete_inode(struct inode *inode) 72void udf_delete_inode(struct inode *inode)
72{ 73{
74 if (!is_bad_inode(inode))
75 dquot_initialize(inode);
76
73 truncate_inode_pages(&inode->i_data, 0); 77 truncate_inode_pages(&inode->i_data, 0);
74 78
75 if (is_bad_inode(inode)) 79 if (is_bad_inode(inode))
@@ -102,12 +106,14 @@ void udf_clear_inode(struct inode *inode)
102 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && 106 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
103 inode->i_size != iinfo->i_lenExtents) { 107 inode->i_size != iinfo->i_lenExtents) {
104 printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has " 108 printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has "
105 "inode size %llu different from extent lenght %llu. " 109 "inode size %llu different from extent length %llu. "
106 "Filesystem need not be standards compliant.\n", 110 "Filesystem need not be standards compliant.\n",
107 inode->i_sb->s_id, inode->i_ino, inode->i_mode, 111 inode->i_sb->s_id, inode->i_ino, inode->i_mode,
108 (unsigned long long)inode->i_size, 112 (unsigned long long)inode->i_size,
109 (unsigned long long)iinfo->i_lenExtents); 113 (unsigned long long)iinfo->i_lenExtents);
110 } 114 }
115
116 dquot_drop(inode);
111 kfree(iinfo->i_ext.i_data); 117 kfree(iinfo->i_ext.i_data);
112 iinfo->i_ext.i_data = NULL; 118 iinfo->i_ext.i_data = NULL;
113} 119}
@@ -1308,7 +1314,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1308 break; 1314 break;
1309 case ICBTAG_FILE_TYPE_SYMLINK: 1315 case ICBTAG_FILE_TYPE_SYMLINK:
1310 inode->i_data.a_ops = &udf_symlink_aops; 1316 inode->i_data.a_ops = &udf_symlink_aops;
1311 inode->i_op = &page_symlink_inode_operations; 1317 inode->i_op = &udf_symlink_inode_operations;
1312 inode->i_mode = S_IFLNK | S_IRWXUGO; 1318 inode->i_mode = S_IFLNK | S_IRWXUGO;
1313 break; 1319 break;
1314 case ICBTAG_FILE_TYPE_MAIN: 1320 case ICBTAG_FILE_TYPE_MAIN:
@@ -1373,12 +1379,12 @@ static mode_t udf_convert_permissions(struct fileEntry *fe)
1373 return mode; 1379 return mode;
1374} 1380}
1375 1381
1376int udf_write_inode(struct inode *inode, int sync) 1382int udf_write_inode(struct inode *inode, struct writeback_control *wbc)
1377{ 1383{
1378 int ret; 1384 int ret;
1379 1385
1380 lock_kernel(); 1386 lock_kernel();
1381 ret = udf_update_inode(inode, sync); 1387 ret = udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
1382 unlock_kernel(); 1388 unlock_kernel();
1383 1389
1384 return ret; 1390 return ret;
@@ -1402,20 +1408,19 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1402 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 1408 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
1403 struct udf_inode_info *iinfo = UDF_I(inode); 1409 struct udf_inode_info *iinfo = UDF_I(inode);
1404 1410
1405 bh = udf_tread(inode->i_sb, 1411 bh = udf_tgetblk(inode->i_sb,
1406 udf_get_lb_pblock(inode->i_sb, 1412 udf_get_lb_pblock(inode->i_sb, &iinfo->i_location, 0));
1407 &iinfo->i_location, 0));
1408 if (!bh) { 1413 if (!bh) {
1409 udf_debug("bread failure\n"); 1414 udf_debug("getblk failure\n");
1410 return -EIO; 1415 return -ENOMEM;
1411 } 1416 }
1412 1417
1413 memset(bh->b_data, 0x00, inode->i_sb->s_blocksize); 1418 lock_buffer(bh);
1414 1419 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1415 fe = (struct fileEntry *)bh->b_data; 1420 fe = (struct fileEntry *)bh->b_data;
1416 efe = (struct extendedFileEntry *)bh->b_data; 1421 efe = (struct extendedFileEntry *)bh->b_data;
1417 1422
1418 if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) { 1423 if (iinfo->i_use) {
1419 struct unallocSpaceEntry *use = 1424 struct unallocSpaceEntry *use =
1420 (struct unallocSpaceEntry *)bh->b_data; 1425 (struct unallocSpaceEntry *)bh->b_data;
1421 1426
@@ -1423,20 +1428,18 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1423 memcpy(bh->b_data + sizeof(struct unallocSpaceEntry), 1428 memcpy(bh->b_data + sizeof(struct unallocSpaceEntry),
1424 iinfo->i_ext.i_data, inode->i_sb->s_blocksize - 1429 iinfo->i_ext.i_data, inode->i_sb->s_blocksize -
1425 sizeof(struct unallocSpaceEntry)); 1430 sizeof(struct unallocSpaceEntry));
1431 use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE);
1432 use->descTag.tagLocation =
1433 cpu_to_le32(iinfo->i_location.logicalBlockNum);
1426 crclen = sizeof(struct unallocSpaceEntry) + 1434 crclen = sizeof(struct unallocSpaceEntry) +
1427 iinfo->i_lenAlloc - sizeof(struct tag); 1435 iinfo->i_lenAlloc - sizeof(struct tag);
1428 use->descTag.tagLocation = cpu_to_le32(
1429 iinfo->i_location.
1430 logicalBlockNum);
1431 use->descTag.descCRCLength = cpu_to_le16(crclen); 1436 use->descTag.descCRCLength = cpu_to_le16(crclen);
1432 use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use + 1437 use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use +
1433 sizeof(struct tag), 1438 sizeof(struct tag),
1434 crclen)); 1439 crclen));
1435 use->descTag.tagChecksum = udf_tag_checksum(&use->descTag); 1440 use->descTag.tagChecksum = udf_tag_checksum(&use->descTag);
1436 1441
1437 mark_buffer_dirty(bh); 1442 goto out;
1438 brelse(bh);
1439 return err;
1440 } 1443 }
1441 1444
1442 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) 1445 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET))
@@ -1591,18 +1594,21 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1591 fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number); 1594 fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number);
1592 fe->descTag.tagLocation = cpu_to_le32( 1595 fe->descTag.tagLocation = cpu_to_le32(
1593 iinfo->i_location.logicalBlockNum); 1596 iinfo->i_location.logicalBlockNum);
1594 crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - 1597 crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - sizeof(struct tag);
1595 sizeof(struct tag);
1596 fe->descTag.descCRCLength = cpu_to_le16(crclen); 1598 fe->descTag.descCRCLength = cpu_to_le16(crclen);
1597 fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag), 1599 fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag),
1598 crclen)); 1600 crclen));
1599 fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); 1601 fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);
1600 1602
1603out:
1604 set_buffer_uptodate(bh);
1605 unlock_buffer(bh);
1606
1601 /* write the data blocks */ 1607 /* write the data blocks */
1602 mark_buffer_dirty(bh); 1608 mark_buffer_dirty(bh);
1603 if (do_sync) { 1609 if (do_sync) {
1604 sync_dirty_buffer(bh); 1610 sync_dirty_buffer(bh);
1605 if (buffer_req(bh) && !buffer_uptodate(bh)) { 1611 if (buffer_write_io_error(bh)) {
1606 printk(KERN_WARNING "IO error syncing udf inode " 1612 printk(KERN_WARNING "IO error syncing udf inode "
1607 "[%s:%08lx]\n", inode->i_sb->s_id, 1613 "[%s:%08lx]\n", inode->i_sb->s_id,
1608 inode->i_ino); 1614 inode->i_ino);
@@ -1672,7 +1678,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1672 return -1; 1678 return -1;
1673 1679
1674 if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) { 1680 if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) {
1675 char *sptr, *dptr; 1681 unsigned char *sptr, *dptr;
1676 struct buffer_head *nbh; 1682 struct buffer_head *nbh;
1677 int err, loffset; 1683 int err, loffset;
1678 struct kernel_lb_addr obloc = epos->block; 1684 struct kernel_lb_addr obloc = epos->block;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index cd2115060fdc..75816025f95f 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -34,8 +34,8 @@
34#include <linux/crc-itu-t.h> 34#include <linux/crc-itu-t.h>
35#include <linux/exportfs.h> 35#include <linux/exportfs.h>
36 36
37static inline int udf_match(int len1, const char *name1, int len2, 37static inline int udf_match(int len1, const unsigned char *name1, int len2,
38 const char *name2) 38 const unsigned char *name2)
39{ 39{
40 if (len1 != len2) 40 if (len1 != len2)
41 return 0; 41 return 0;
@@ -142,15 +142,15 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
142} 142}
143 143
144static struct fileIdentDesc *udf_find_entry(struct inode *dir, 144static struct fileIdentDesc *udf_find_entry(struct inode *dir,
145 struct qstr *child, 145 const struct qstr *child,
146 struct udf_fileident_bh *fibh, 146 struct udf_fileident_bh *fibh,
147 struct fileIdentDesc *cfi) 147 struct fileIdentDesc *cfi)
148{ 148{
149 struct fileIdentDesc *fi = NULL; 149 struct fileIdentDesc *fi = NULL;
150 loff_t f_pos; 150 loff_t f_pos;
151 int block, flen; 151 int block, flen;
152 char *fname = NULL; 152 unsigned char *fname = NULL;
153 char *nameptr; 153 unsigned char *nameptr;
154 uint8_t lfi; 154 uint8_t lfi;
155 uint16_t liu; 155 uint16_t liu;
156 loff_t size; 156 loff_t size;
@@ -308,7 +308,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
308{ 308{
309 struct super_block *sb = dir->i_sb; 309 struct super_block *sb = dir->i_sb;
310 struct fileIdentDesc *fi = NULL; 310 struct fileIdentDesc *fi = NULL;
311 char *name = NULL; 311 unsigned char *name = NULL;
312 int namelen; 312 int namelen;
313 loff_t f_pos; 313 loff_t f_pos;
314 loff_t size = udf_ext0_offset(dir) + dir->i_size; 314 loff_t size = udf_ext0_offset(dir) + dir->i_size;
@@ -563,6 +563,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
563 int err; 563 int err;
564 struct udf_inode_info *iinfo; 564 struct udf_inode_info *iinfo;
565 565
566 dquot_initialize(dir);
567
566 lock_kernel(); 568 lock_kernel();
567 inode = udf_new_inode(dir, mode, &err); 569 inode = udf_new_inode(dir, mode, &err);
568 if (!inode) { 570 if (!inode) {
@@ -616,6 +618,8 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
616 if (!old_valid_dev(rdev)) 618 if (!old_valid_dev(rdev))
617 return -EINVAL; 619 return -EINVAL;
618 620
621 dquot_initialize(dir);
622
619 lock_kernel(); 623 lock_kernel();
620 err = -EIO; 624 err = -EIO;
621 inode = udf_new_inode(dir, mode, &err); 625 inode = udf_new_inode(dir, mode, &err);
@@ -662,6 +666,8 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
662 struct udf_inode_info *dinfo = UDF_I(dir); 666 struct udf_inode_info *dinfo = UDF_I(dir);
663 struct udf_inode_info *iinfo; 667 struct udf_inode_info *iinfo;
664 668
669 dquot_initialize(dir);
670
665 lock_kernel(); 671 lock_kernel();
666 err = -EMLINK; 672 err = -EMLINK;
667 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) 673 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
@@ -799,6 +805,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
799 struct fileIdentDesc *fi, cfi; 805 struct fileIdentDesc *fi, cfi;
800 struct kernel_lb_addr tloc; 806 struct kernel_lb_addr tloc;
801 807
808 dquot_initialize(dir);
809
802 retval = -ENOENT; 810 retval = -ENOENT;
803 lock_kernel(); 811 lock_kernel();
804 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 812 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -845,6 +853,8 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
845 struct fileIdentDesc cfi; 853 struct fileIdentDesc cfi;
846 struct kernel_lb_addr tloc; 854 struct kernel_lb_addr tloc;
847 855
856 dquot_initialize(dir);
857
848 retval = -ENOENT; 858 retval = -ENOENT;
849 lock_kernel(); 859 lock_kernel();
850 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 860 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -885,20 +895,22 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
885{ 895{
886 struct inode *inode; 896 struct inode *inode;
887 struct pathComponent *pc; 897 struct pathComponent *pc;
888 char *compstart; 898 const char *compstart;
889 struct udf_fileident_bh fibh; 899 struct udf_fileident_bh fibh;
890 struct extent_position epos = {}; 900 struct extent_position epos = {};
891 int eoffset, elen = 0; 901 int eoffset, elen = 0;
892 struct fileIdentDesc *fi; 902 struct fileIdentDesc *fi;
893 struct fileIdentDesc cfi; 903 struct fileIdentDesc cfi;
894 char *ea; 904 uint8_t *ea;
895 int err; 905 int err;
896 int block; 906 int block;
897 char *name = NULL; 907 unsigned char *name = NULL;
898 int namelen; 908 int namelen;
899 struct buffer_head *bh; 909 struct buffer_head *bh;
900 struct udf_inode_info *iinfo; 910 struct udf_inode_info *iinfo;
901 911
912 dquot_initialize(dir);
913
902 lock_kernel(); 914 lock_kernel();
903 inode = udf_new_inode(dir, S_IFLNK, &err); 915 inode = udf_new_inode(dir, S_IFLNK, &err);
904 if (!inode) 916 if (!inode)
@@ -913,7 +925,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
913 iinfo = UDF_I(inode); 925 iinfo = UDF_I(inode);
914 inode->i_mode = S_IFLNK | S_IRWXUGO; 926 inode->i_mode = S_IFLNK | S_IRWXUGO;
915 inode->i_data.a_ops = &udf_symlink_aops; 927 inode->i_data.a_ops = &udf_symlink_aops;
916 inode->i_op = &page_symlink_inode_operations; 928 inode->i_op = &udf_symlink_inode_operations;
917 929
918 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { 930 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
919 struct kernel_lb_addr eloc; 931 struct kernel_lb_addr eloc;
@@ -970,7 +982,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
970 982
971 pc = (struct pathComponent *)(ea + elen); 983 pc = (struct pathComponent *)(ea + elen);
972 984
973 compstart = (char *)symname; 985 compstart = symname;
974 986
975 do { 987 do {
976 symname++; 988 symname++;
@@ -1069,6 +1081,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1069 int err; 1081 int err;
1070 struct buffer_head *bh; 1082 struct buffer_head *bh;
1071 1083
1084 dquot_initialize(dir);
1085
1072 lock_kernel(); 1086 lock_kernel();
1073 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { 1087 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
1074 unlock_kernel(); 1088 unlock_kernel();
@@ -1131,6 +1145,9 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1131 struct kernel_lb_addr tloc; 1145 struct kernel_lb_addr tloc;
1132 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1146 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1133 1147
1148 dquot_initialize(old_dir);
1149 dquot_initialize(new_dir);
1150
1134 lock_kernel(); 1151 lock_kernel();
1135 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); 1152 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
1136 if (ofi) { 1153 if (ofi) {
@@ -1376,6 +1393,7 @@ const struct export_operations udf_export_ops = {
1376const struct inode_operations udf_dir_inode_operations = { 1393const struct inode_operations udf_dir_inode_operations = {
1377 .lookup = udf_lookup, 1394 .lookup = udf_lookup,
1378 .create = udf_create, 1395 .create = udf_create,
1396 .setattr = udf_setattr,
1379 .link = udf_link, 1397 .link = udf_link,
1380 .unlink = udf_unlink, 1398 .unlink = udf_unlink,
1381 .symlink = udf_symlink, 1399 .symlink = udf_symlink,
@@ -1384,3 +1402,9 @@ const struct inode_operations udf_dir_inode_operations = {
1384 .mknod = udf_mknod, 1402 .mknod = udf_mknod,
1385 .rename = udf_rename, 1403 .rename = udf_rename,
1386}; 1404};
1405const struct inode_operations udf_symlink_inode_operations = {
1406 .readlink = generic_readlink,
1407 .follow_link = page_follow_link_light,
1408 .put_link = page_put_link,
1409 .setattr = udf_setattr,
1410};
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 4b540ee632d5..745eb209be0c 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -24,7 +24,6 @@
24 24
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/string.h> 26#include <linux/string.h>
27#include <linux/slab.h>
28#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
29 28
30uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, 29uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index c3265e1385d4..16064787d2b7 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -26,18 +26,17 @@
26#include <linux/time.h> 26#include <linux/time.h>
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/stat.h> 28#include <linux/stat.h>
29#include <linux/slab.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
32#include <linux/buffer_head.h> 31#include <linux/buffer_head.h>
33#include "udf_i.h" 32#include "udf_i.h"
34 33
35static void udf_pc_to_char(struct super_block *sb, char *from, int fromlen, 34static void udf_pc_to_char(struct super_block *sb, unsigned char *from,
36 char *to) 35 int fromlen, unsigned char *to)
37{ 36{
38 struct pathComponent *pc; 37 struct pathComponent *pc;
39 int elen = 0; 38 int elen = 0;
40 char *p = to; 39 unsigned char *p = to;
41 40
42 while (elen < fromlen) { 41 while (elen < fromlen) {
43 pc = (struct pathComponent *)(from + elen); 42 pc = (struct pathComponent *)(from + elen);
@@ -75,9 +74,9 @@ static int udf_symlink_filler(struct file *file, struct page *page)
75{ 74{
76 struct inode *inode = page->mapping->host; 75 struct inode *inode = page->mapping->host;
77 struct buffer_head *bh = NULL; 76 struct buffer_head *bh = NULL;
78 char *symlink; 77 unsigned char *symlink;
79 int err = -EIO; 78 int err = -EIO;
80 char *p = kmap(page); 79 unsigned char *p = kmap(page);
81 struct udf_inode_info *iinfo; 80 struct udf_inode_info *iinfo;
82 81
83 lock_kernel(); 82 lock_kernel();
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 8d46f4294ee7..702a1148e702 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -76,6 +76,7 @@ extern const struct inode_operations udf_dir_inode_operations;
76extern const struct file_operations udf_dir_operations; 76extern const struct file_operations udf_dir_operations;
77extern const struct inode_operations udf_file_inode_operations; 77extern const struct inode_operations udf_file_inode_operations;
78extern const struct file_operations udf_file_operations; 78extern const struct file_operations udf_file_operations;
79extern const struct inode_operations udf_symlink_inode_operations;
79extern const struct address_space_operations udf_aops; 80extern const struct address_space_operations udf_aops;
80extern const struct address_space_operations udf_adinicb_aops; 81extern const struct address_space_operations udf_adinicb_aops;
81extern const struct address_space_operations udf_symlink_aops; 82extern const struct address_space_operations udf_symlink_aops;
@@ -131,7 +132,7 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
131/* file.c */ 132/* file.c */
132extern int udf_ioctl(struct inode *, struct file *, unsigned int, 133extern int udf_ioctl(struct inode *, struct file *, unsigned int,
133 unsigned long); 134 unsigned long);
134 135extern int udf_setattr(struct dentry *dentry, struct iattr *iattr);
135/* inode.c */ 136/* inode.c */
136extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 137extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
137extern int udf_sync_inode(struct inode *); 138extern int udf_sync_inode(struct inode *);
@@ -142,7 +143,7 @@ extern void udf_truncate(struct inode *);
142extern void udf_read_inode(struct inode *); 143extern void udf_read_inode(struct inode *);
143extern void udf_delete_inode(struct inode *); 144extern void udf_delete_inode(struct inode *);
144extern void udf_clear_inode(struct inode *); 145extern void udf_clear_inode(struct inode *);
145extern int udf_write_inode(struct inode *, int); 146extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
146extern long udf_block_map(struct inode *, sector_t); 147extern long udf_block_map(struct inode *, sector_t);
147extern int udf_extend_file(struct inode *, struct extent_position *, 148extern int udf_extend_file(struct inode *, struct extent_position *,
148 struct kernel_long_ad *, sector_t); 149 struct kernel_long_ad *, sector_t);
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index cefa8c8913e6..d03a90b6ad69 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -24,6 +24,7 @@
24#include <linux/string.h> /* for memset */ 24#include <linux/string.h> /* for memset */
25#include <linux/nls.h> 25#include <linux/nls.h>
26#include <linux/crc-itu-t.h> 26#include <linux/crc-itu-t.h>
27#include <linux/slab.h>
27 28
28#include "udf_sb.h" 29#include "udf_sb.h"
29 30
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 54c16ec95dff..5cfa4d85ccf2 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -85,7 +85,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
85 "bit already cleared for fragment %u", i); 85 "bit already cleared for fragment %u", i);
86 } 86 }
87 87
88 vfs_dq_free_block(inode, count); 88 dquot_free_block(inode, count);
89 89
90 90
91 fs32_add(sb, &ucg->cg_cs.cs_nffree, count); 91 fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
@@ -195,7 +195,7 @@ do_more:
195 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 195 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
196 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 196 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
197 ufs_clusteracct (sb, ucpi, blkno, 1); 197 ufs_clusteracct (sb, ucpi, blkno, 1);
198 vfs_dq_free_block(inode, uspi->s_fpb); 198 dquot_free_block(inode, uspi->s_fpb);
199 199
200 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); 200 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
201 uspi->cs_total.cs_nbfree++; 201 uspi->cs_total.cs_nbfree++;
@@ -511,6 +511,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
511 struct ufs_cg_private_info * ucpi; 511 struct ufs_cg_private_info * ucpi;
512 struct ufs_cylinder_group * ucg; 512 struct ufs_cylinder_group * ucg;
513 unsigned cgno, fragno, fragoff, count, fragsize, i; 513 unsigned cgno, fragno, fragoff, count, fragsize, i;
514 int ret;
514 515
515 UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n", 516 UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n",
516 (unsigned long long)fragment, oldcount, newcount); 517 (unsigned long long)fragment, oldcount, newcount);
@@ -556,8 +557,9 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
556 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); 557 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
557 for (i = oldcount; i < newcount; i++) 558 for (i = oldcount; i < newcount; i++)
558 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i); 559 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
559 if (vfs_dq_alloc_block(inode, count)) { 560 ret = dquot_alloc_block(inode, count);
560 *err = -EDQUOT; 561 if (ret) {
562 *err = ret;
561 return 0; 563 return 0;
562 } 564 }
563 565
@@ -596,6 +598,7 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
596 struct ufs_cylinder_group * ucg; 598 struct ufs_cylinder_group * ucg;
597 unsigned oldcg, i, j, k, allocsize; 599 unsigned oldcg, i, j, k, allocsize;
598 u64 result; 600 u64 result;
601 int ret;
599 602
600 UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n", 603 UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n",
601 inode->i_ino, cgno, (unsigned long long)goal, count); 604 inode->i_ino, cgno, (unsigned long long)goal, count);
@@ -664,7 +667,7 @@ cg_found:
664 for (i = count; i < uspi->s_fpb; i++) 667 for (i = count; i < uspi->s_fpb; i++)
665 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); 668 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
666 i = uspi->s_fpb - count; 669 i = uspi->s_fpb - count;
667 vfs_dq_free_block(inode, i); 670 dquot_free_block(inode, i);
668 671
669 fs32_add(sb, &ucg->cg_cs.cs_nffree, i); 672 fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
670 uspi->cs_total.cs_nffree += i; 673 uspi->cs_total.cs_nffree += i;
@@ -676,8 +679,9 @@ cg_found:
676 result = ufs_bitmap_search (sb, ucpi, goal, allocsize); 679 result = ufs_bitmap_search (sb, ucpi, goal, allocsize);
677 if (result == INVBLOCK) 680 if (result == INVBLOCK)
678 return 0; 681 return 0;
679 if (vfs_dq_alloc_block(inode, count)) { 682 ret = dquot_alloc_block(inode, count);
680 *err = -EDQUOT; 683 if (ret) {
684 *err = ret;
681 return 0; 685 return 0;
682 } 686 }
683 for (i = 0; i < count; i++) 687 for (i = 0; i < count; i++)
@@ -714,6 +718,7 @@ static u64 ufs_alloccg_block(struct inode *inode,
714 struct ufs_super_block_first * usb1; 718 struct ufs_super_block_first * usb1;
715 struct ufs_cylinder_group * ucg; 719 struct ufs_cylinder_group * ucg;
716 u64 result, blkno; 720 u64 result, blkno;
721 int ret;
717 722
718 UFSD("ENTER, goal %llu\n", (unsigned long long)goal); 723 UFSD("ENTER, goal %llu\n", (unsigned long long)goal);
719 724
@@ -747,8 +752,9 @@ gotit:
747 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 752 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
748 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 753 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
749 ufs_clusteracct (sb, ucpi, blkno, -1); 754 ufs_clusteracct (sb, ucpi, blkno, -1);
750 if (vfs_dq_alloc_block(inode, uspi->s_fpb)) { 755 ret = dquot_alloc_block(inode, uspi->s_fpb);
751 *err = -EDQUOT; 756 if (ret) {
757 *err = ret;
752 return INVBLOCK; 758 return INVBLOCK;
753 } 759 }
754 760
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 22af68f8b682..317a0d444f6b 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -31,7 +31,7 @@
31 * len <= UFS_MAXNAMLEN and de != NULL are guaranteed by caller. 31 * len <= UFS_MAXNAMLEN and de != NULL are guaranteed by caller.
32 */ 32 */
33static inline int ufs_match(struct super_block *sb, int len, 33static inline int ufs_match(struct super_block *sb, int len,
34 const char * const name, struct ufs_dir_entry * de) 34 const unsigned char *name, struct ufs_dir_entry *de)
35{ 35{
36 if (len != ufs_get_de_namlen(sb, de)) 36 if (len != ufs_get_de_namlen(sb, de))
37 return 0; 37 return 0;
@@ -70,7 +70,7 @@ static inline unsigned long ufs_dir_pages(struct inode *inode)
70 return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; 70 return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
71} 71}
72 72
73ino_t ufs_inode_by_name(struct inode *dir, struct qstr *qstr) 73ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr)
74{ 74{
75 ino_t res = 0; 75 ino_t res = 0;
76 struct ufs_dir_entry *de; 76 struct ufs_dir_entry *de;
@@ -249,11 +249,11 @@ struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p)
249 * (as a parameter - res_dir). Page is returned mapped and unlocked. 249 * (as a parameter - res_dir). Page is returned mapped and unlocked.
250 * Entry is guaranteed to be valid. 250 * Entry is guaranteed to be valid.
251 */ 251 */
252struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct qstr *qstr, 252struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr,
253 struct page **res_page) 253 struct page **res_page)
254{ 254{
255 struct super_block *sb = dir->i_sb; 255 struct super_block *sb = dir->i_sb;
256 const char *name = qstr->name; 256 const unsigned char *name = qstr->name;
257 int namelen = qstr->len; 257 int namelen = qstr->len;
258 unsigned reclen = UFS_DIR_REC_LEN(namelen); 258 unsigned reclen = UFS_DIR_REC_LEN(namelen);
259 unsigned long start, n; 259 unsigned long start, n;
@@ -313,7 +313,7 @@ found:
313int ufs_add_link(struct dentry *dentry, struct inode *inode) 313int ufs_add_link(struct dentry *dentry, struct inode *inode)
314{ 314{
315 struct inode *dir = dentry->d_parent->d_inode; 315 struct inode *dir = dentry->d_parent->d_inode;
316 const char *name = dentry->d_name.name; 316 const unsigned char *name = dentry->d_name.name;
317 int namelen = dentry->d_name.len; 317 int namelen = dentry->d_name.len;
318 struct super_block *sb = dir->i_sb; 318 struct super_block *sb = dir->i_sb;
319 unsigned reclen = UFS_DIR_REC_LEN(namelen); 319 unsigned reclen = UFS_DIR_REC_LEN(namelen);
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 73655c61240a..a8962cecde5b 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -24,6 +24,7 @@
24 */ 24 */
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/quotaops.h>
27 28
28#include "ufs_fs.h" 29#include "ufs_fs.h"
29#include "ufs.h" 30#include "ufs.h"
@@ -40,7 +41,7 @@ const struct file_operations ufs_file_operations = {
40 .write = do_sync_write, 41 .write = do_sync_write,
41 .aio_write = generic_file_aio_write, 42 .aio_write = generic_file_aio_write,
42 .mmap = generic_file_mmap, 43 .mmap = generic_file_mmap,
43 .open = generic_file_open, 44 .open = dquot_file_open,
44 .fsync = simple_fsync, 45 .fsync = simple_fsync,
45 .splice_read = generic_file_splice_read, 46 .splice_read = generic_file_splice_read,
46}; 47};
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 3527c00fef0d..230ecf608026 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -95,8 +95,8 @@ void ufs_free_inode (struct inode * inode)
95 95
96 is_directory = S_ISDIR(inode->i_mode); 96 is_directory = S_ISDIR(inode->i_mode);
97 97
98 vfs_dq_free_inode(inode); 98 dquot_free_inode(inode);
99 vfs_dq_drop(inode); 99 dquot_drop(inode);
100 100
101 clear_inode (inode); 101 clear_inode (inode);
102 102
@@ -355,9 +355,10 @@ cg_found:
355 355
356 unlock_super (sb); 356 unlock_super (sb);
357 357
358 if (vfs_dq_alloc_inode(inode)) { 358 dquot_initialize(inode);
359 vfs_dq_drop(inode); 359 err = dquot_alloc_inode(inode);
360 err = -EDQUOT; 360 if (err) {
361 dquot_drop(inode);
361 goto fail_without_unlock; 362 goto fail_without_unlock;
362 } 363 }
363 364
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 7cf33379fd46..80b68c3702d1 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -36,6 +36,8 @@
36#include <linux/mm.h> 36#include <linux/mm.h>
37#include <linux/smp_lock.h> 37#include <linux/smp_lock.h>
38#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
39#include <linux/writeback.h>
40#include <linux/quotaops.h>
39 41
40#include "ufs_fs.h" 42#include "ufs_fs.h"
41#include "ufs.h" 43#include "ufs.h"
@@ -890,11 +892,11 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
890 return 0; 892 return 0;
891} 893}
892 894
893int ufs_write_inode (struct inode * inode, int wait) 895int ufs_write_inode(struct inode *inode, struct writeback_control *wbc)
894{ 896{
895 int ret; 897 int ret;
896 lock_kernel(); 898 lock_kernel();
897 ret = ufs_update_inode (inode, wait); 899 ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
898 unlock_kernel(); 900 unlock_kernel();
899 return ret; 901 return ret;
900} 902}
@@ -908,6 +910,9 @@ void ufs_delete_inode (struct inode * inode)
908{ 910{
909 loff_t old_i_size; 911 loff_t old_i_size;
910 912
913 if (!is_bad_inode(inode))
914 dquot_initialize(inode);
915
911 truncate_inode_pages(&inode->i_data, 0); 916 truncate_inode_pages(&inode->i_data, 0);
912 if (is_bad_inode(inode)) 917 if (is_bad_inode(inode))
913 goto no_delete; 918 goto no_delete;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 4c26d9e8bc94..118556243e7a 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -30,6 +30,7 @@
30#include <linux/time.h> 30#include <linux/time.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/smp_lock.h> 32#include <linux/smp_lock.h>
33#include <linux/quotaops.h>
33 34
34#include "ufs_fs.h" 35#include "ufs_fs.h"
35#include "ufs.h" 36#include "ufs.h"
@@ -84,6 +85,9 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
84 int err; 85 int err;
85 86
86 UFSD("BEGIN\n"); 87 UFSD("BEGIN\n");
88
89 dquot_initialize(dir);
90
87 inode = ufs_new_inode(dir, mode); 91 inode = ufs_new_inode(dir, mode);
88 err = PTR_ERR(inode); 92 err = PTR_ERR(inode);
89 93
@@ -107,6 +111,9 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t
107 111
108 if (!old_valid_dev(rdev)) 112 if (!old_valid_dev(rdev))
109 return -EINVAL; 113 return -EINVAL;
114
115 dquot_initialize(dir);
116
110 inode = ufs_new_inode(dir, mode); 117 inode = ufs_new_inode(dir, mode);
111 err = PTR_ERR(inode); 118 err = PTR_ERR(inode);
112 if (!IS_ERR(inode)) { 119 if (!IS_ERR(inode)) {
@@ -131,6 +138,8 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
131 if (l > sb->s_blocksize) 138 if (l > sb->s_blocksize)
132 goto out_notlocked; 139 goto out_notlocked;
133 140
141 dquot_initialize(dir);
142
134 lock_kernel(); 143 lock_kernel();
135 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); 144 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
136 err = PTR_ERR(inode); 145 err = PTR_ERR(inode);
@@ -176,6 +185,8 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
176 return -EMLINK; 185 return -EMLINK;
177 } 186 }
178 187
188 dquot_initialize(dir);
189
179 inode->i_ctime = CURRENT_TIME_SEC; 190 inode->i_ctime = CURRENT_TIME_SEC;
180 inode_inc_link_count(inode); 191 inode_inc_link_count(inode);
181 atomic_inc(&inode->i_count); 192 atomic_inc(&inode->i_count);
@@ -193,6 +204,8 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
193 if (dir->i_nlink >= UFS_LINK_MAX) 204 if (dir->i_nlink >= UFS_LINK_MAX)
194 goto out; 205 goto out;
195 206
207 dquot_initialize(dir);
208
196 lock_kernel(); 209 lock_kernel();
197 inode_inc_link_count(dir); 210 inode_inc_link_count(dir);
198 211
@@ -237,6 +250,8 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
237 struct page *page; 250 struct page *page;
238 int err = -ENOENT; 251 int err = -ENOENT;
239 252
253 dquot_initialize(dir);
254
240 de = ufs_find_entry(dir, &dentry->d_name, &page); 255 de = ufs_find_entry(dir, &dentry->d_name, &page);
241 if (!de) 256 if (!de)
242 goto out; 257 goto out;
@@ -281,6 +296,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
281 struct ufs_dir_entry *old_de; 296 struct ufs_dir_entry *old_de;
282 int err = -ENOENT; 297 int err = -ENOENT;
283 298
299 dquot_initialize(old_dir);
300 dquot_initialize(new_dir);
301
284 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); 302 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
285 if (!old_de) 303 if (!old_de)
286 goto out; 304 goto out;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 143c20bfb04b..14743d935a93 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1016,6 +1016,9 @@ magic_found:
1016 case UFS_FSSTABLE: 1016 case UFS_FSSTABLE:
1017 UFSD("fs is stable\n"); 1017 UFSD("fs is stable\n");
1018 break; 1018 break;
1019 case UFS_FSLOG:
1020 UFSD("fs is logging fs\n");
1021 break;
1019 case UFS_FSOSF1: 1022 case UFS_FSOSF1:
1020 UFSD("fs is DEC OSF/1\n"); 1023 UFSD("fs is DEC OSF/1\n");
1021 break; 1024 break;
@@ -1432,6 +1435,11 @@ static void destroy_inodecache(void)
1432 kmem_cache_destroy(ufs_inode_cachep); 1435 kmem_cache_destroy(ufs_inode_cachep);
1433} 1436}
1434 1437
1438static void ufs_clear_inode(struct inode *inode)
1439{
1440 dquot_drop(inode);
1441}
1442
1435#ifdef CONFIG_QUOTA 1443#ifdef CONFIG_QUOTA
1436static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t); 1444static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t);
1437static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t); 1445static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t);
@@ -1442,6 +1450,7 @@ static const struct super_operations ufs_super_ops = {
1442 .destroy_inode = ufs_destroy_inode, 1450 .destroy_inode = ufs_destroy_inode,
1443 .write_inode = ufs_write_inode, 1451 .write_inode = ufs_write_inode,
1444 .delete_inode = ufs_delete_inode, 1452 .delete_inode = ufs_delete_inode,
1453 .clear_inode = ufs_clear_inode,
1445 .put_super = ufs_put_super, 1454 .put_super = ufs_put_super,
1446 .write_super = ufs_write_super, 1455 .write_super = ufs_write_super,
1447 .sync_fs = ufs_sync_fs, 1456 .sync_fs = ufs_sync_fs,
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 41dd431ce228..d3b6270cb377 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -44,6 +44,7 @@
44#include <linux/buffer_head.h> 44#include <linux/buffer_head.h>
45#include <linux/blkdev.h> 45#include <linux/blkdev.h>
46#include <linux/sched.h> 46#include <linux/sched.h>
47#include <linux/quotaops.h>
47 48
48#include "ufs_fs.h" 49#include "ufs_fs.h"
49#include "ufs.h" 50#include "ufs.h"
@@ -517,9 +518,18 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr)
517 if (error) 518 if (error)
518 return error; 519 return error;
519 520
521 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
522 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
523 error = dquot_transfer(inode, attr);
524 if (error)
525 return error;
526 }
520 if (ia_valid & ATTR_SIZE && 527 if (ia_valid & ATTR_SIZE &&
521 attr->ia_size != i_size_read(inode)) { 528 attr->ia_size != i_size_read(inode)) {
522 loff_t old_i_size = inode->i_size; 529 loff_t old_i_size = inode->i_size;
530
531 dquot_initialize(inode);
532
523 error = vmtruncate(inode, attr->ia_size); 533 error = vmtruncate(inode, attr->ia_size);
524 if (error) 534 if (error)
525 return error; 535 return error;
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 0b4c39bc0d9e..43f9f5d5670e 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -86,9 +86,9 @@ extern void ufs_put_cylinder (struct super_block *, unsigned);
86/* dir.c */ 86/* dir.c */
87extern const struct inode_operations ufs_dir_inode_operations; 87extern const struct inode_operations ufs_dir_inode_operations;
88extern int ufs_add_link (struct dentry *, struct inode *); 88extern int ufs_add_link (struct dentry *, struct inode *);
89extern ino_t ufs_inode_by_name(struct inode *, struct qstr *); 89extern ino_t ufs_inode_by_name(struct inode *, const struct qstr *);
90extern int ufs_make_empty(struct inode *, struct inode *); 90extern int ufs_make_empty(struct inode *, struct inode *);
91extern struct ufs_dir_entry *ufs_find_entry(struct inode *, struct qstr *, struct page **); 91extern struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *, struct page **);
92extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *); 92extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *);
93extern int ufs_empty_dir (struct inode *); 93extern int ufs_empty_dir (struct inode *);
94extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **); 94extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **);
@@ -106,7 +106,7 @@ extern struct inode * ufs_new_inode (struct inode *, int);
106 106
107/* inode.c */ 107/* inode.c */
108extern struct inode *ufs_iget(struct super_block *, unsigned long); 108extern struct inode *ufs_iget(struct super_block *, unsigned long);
109extern int ufs_write_inode (struct inode *, int); 109extern int ufs_write_inode (struct inode *, struct writeback_control *);
110extern int ufs_sync_inode (struct inode *); 110extern int ufs_sync_inode (struct inode *);
111extern void ufs_delete_inode (struct inode *); 111extern void ufs_delete_inode (struct inode *);
112extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *); 112extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *);
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 54bde1895a80..6943ec677c0b 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -138,6 +138,7 @@ typedef __u16 __bitwise __fs16;
138 138
139#define UFS_USEEFT ((__u16)65535) 139#define UFS_USEEFT ((__u16)65535)
140 140
141/* fs_clean values */
141#define UFS_FSOK 0x7c269d38 142#define UFS_FSOK 0x7c269d38
142#define UFS_FSACTIVE ((__s8)0x00) 143#define UFS_FSACTIVE ((__s8)0x00)
143#define UFS_FSCLEAN ((__s8)0x01) 144#define UFS_FSCLEAN ((__s8)0x01)
@@ -145,6 +146,11 @@ typedef __u16 __bitwise __fs16;
145#define UFS_FSOSF1 ((__s8)0x03) /* is this correct for DEC OSF/1? */ 146#define UFS_FSOSF1 ((__s8)0x03) /* is this correct for DEC OSF/1? */
146#define UFS_FSBAD ((__s8)0xff) 147#define UFS_FSBAD ((__s8)0xff)
147 148
149/* Solaris-specific fs_clean values */
150#define UFS_FSSUSPEND ((__s8)0xfe) /* temporarily suspended */
151#define UFS_FSLOG ((__s8)0xfd) /* logging fs */
152#define UFS_FSFIX ((__s8)0xfc) /* being repaired while mounted */
153
148/* From here to next blank line, s_flags for ufs_sb_info */ 154/* From here to next blank line, s_flags for ufs_sb_info */
149/* directory entry encoding */ 155/* directory entry encoding */
150#define UFS_DE_MASK 0x00000010 /* mask for the following */ 156#define UFS_DE_MASK 0x00000010 /* mask for the following */
@@ -227,11 +233,16 @@ typedef __u16 __bitwise __fs16;
227 */ 233 */
228#define ufs_cbtocylno(bno) \ 234#define ufs_cbtocylno(bno) \
229 ((bno) * uspi->s_nspf / uspi->s_spc) 235 ((bno) * uspi->s_nspf / uspi->s_spc)
230#define ufs_cbtorpos(bno) \ 236#define ufs_cbtorpos(bno) \
237 ((UFS_SB(sb)->s_flags & UFS_CG_SUN) ? \
238 (((((bno) * uspi->s_nspf % uspi->s_spc) % \
239 uspi->s_nsect) * \
240 uspi->s_nrpos) / uspi->s_nsect) \
241 : \
231 ((((bno) * uspi->s_nspf % uspi->s_spc / uspi->s_nsect \ 242 ((((bno) * uspi->s_nspf % uspi->s_spc / uspi->s_nsect \
232 * uspi->s_trackskew + (bno) * uspi->s_nspf % uspi->s_spc \ 243 * uspi->s_trackskew + (bno) * uspi->s_nspf % uspi->s_spc \
233 % uspi->s_nsect * uspi->s_interleave) % uspi->s_nsect \ 244 % uspi->s_nsect * uspi->s_interleave) % uspi->s_nsect \
234 * uspi->s_nrpos) / uspi->s_npsect) 245 * uspi->s_nrpos) / uspi->s_npsect))
235 246
236/* 247/*
237 * The following macros optimize certain frequently calculated 248 * The following macros optimize certain frequently calculated
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
index 05ac0fe9c4d3..8d5a506c82eb 100644
--- a/fs/xattr_acl.c
+++ b/fs/xattr_acl.c
@@ -6,9 +6,9 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/slab.h>
10#include <linux/fs.h> 9#include <linux/fs.h>
11#include <linux/posix_acl_xattr.h> 10#include <linux/posix_acl_xattr.h>
11#include <linux/gfp.h>
12 12
13 13
14/* 14/*
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 56641fe52a23..b4769e40e8bc 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -16,7 +16,7 @@
16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17# 17#
18 18
19EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6 -funsigned-char 19EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6
20 20
21XFS_LINUX := linux-2.6 21XFS_LINUX := linux-2.6
22 22
@@ -105,7 +105,6 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
105 xfs_globals.o \ 105 xfs_globals.o \
106 xfs_ioctl.o \ 106 xfs_ioctl.o \
107 xfs_iops.o \ 107 xfs_iops.o \
108 xfs_lrw.o \
109 xfs_super.o \ 108 xfs_super.o \
110 xfs_sync.o \ 109 xfs_sync.o \
111 xfs_xattr.o) 110 xfs_xattr.o)
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 2d3f90afe5f1..666c9db48eb6 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -16,16 +16,33 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/vmalloc.h>
20#include <linux/highmem.h> 19#include <linux/highmem.h>
20#include <linux/slab.h>
21#include <linux/swap.h> 21#include <linux/swap.h>
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
24#include "time.h" 24#include "time.h"
25#include "kmem.h" 25#include "kmem.h"
26 26
27#define MAX_VMALLOCS 6 27/*
28#define MAX_SLAB_SIZE 0x20000 28 * Greedy allocation. May fail and may return vmalloced memory.
29 *
30 * Must be freed using kmem_free_large.
31 */
32void *
33kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
34{
35 void *ptr;
36 size_t kmsize = maxsize;
37
38 while (!(ptr = kmem_zalloc_large(kmsize))) {
39 if ((kmsize >>= 1) <= minsize)
40 kmsize = minsize;
41 }
42 if (ptr)
43 *size = kmsize;
44 return ptr;
45}
29 46
30void * 47void *
31kmem_alloc(size_t size, unsigned int __nocast flags) 48kmem_alloc(size_t size, unsigned int __nocast flags)
@@ -34,19 +51,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
34 gfp_t lflags = kmem_flags_convert(flags); 51 gfp_t lflags = kmem_flags_convert(flags);
35 void *ptr; 52 void *ptr;
36 53
37#ifdef DEBUG
38 if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) {
39 printk(KERN_WARNING "Large %s attempt, size=%ld\n",
40 __func__, (long)size);
41 dump_stack();
42 }
43#endif
44
45 do { 54 do {
46 if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS) 55 ptr = kmalloc(size, lflags);
47 ptr = kmalloc(size, lflags);
48 else
49 ptr = __vmalloc(size, lflags, PAGE_KERNEL);
50 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 56 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
51 return ptr; 57 return ptr;
52 if (!(++retries % 100)) 58 if (!(++retries % 100))
@@ -68,27 +74,6 @@ kmem_zalloc(size_t size, unsigned int __nocast flags)
68 return ptr; 74 return ptr;
69} 75}
70 76
71void *
72kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize,
73 unsigned int __nocast flags)
74{
75 void *ptr;
76 size_t kmsize = maxsize;
77 unsigned int kmflags = (flags & ~KM_SLEEP) | KM_NOSLEEP;
78
79 while (!(ptr = kmem_zalloc(kmsize, kmflags))) {
80 if ((kmsize <= minsize) && (flags & KM_NOSLEEP))
81 break;
82 if ((kmsize >>= 1) <= minsize) {
83 kmsize = minsize;
84 kmflags = flags;
85 }
86 }
87 if (ptr)
88 *size = kmsize;
89 return ptr;
90}
91
92void 77void
93kmem_free(const void *ptr) 78kmem_free(const void *ptr)
94{ 79{
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 179cbd630f69..f7c8f7a9ea6d 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -21,6 +21,7 @@
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/mm.h> 23#include <linux/mm.h>
24#include <linux/vmalloc.h>
24 25
25/* 26/*
26 * General memory allocation interfaces 27 * General memory allocation interfaces
@@ -30,7 +31,6 @@
30#define KM_NOSLEEP 0x0002u 31#define KM_NOSLEEP 0x0002u
31#define KM_NOFS 0x0004u 32#define KM_NOFS 0x0004u
32#define KM_MAYFAIL 0x0008u 33#define KM_MAYFAIL 0x0008u
33#define KM_LARGE 0x0010u
34 34
35/* 35/*
36 * We use a special process flag to avoid recursive callbacks into 36 * We use a special process flag to avoid recursive callbacks into
@@ -42,7 +42,7 @@ kmem_flags_convert(unsigned int __nocast flags)
42{ 42{
43 gfp_t lflags; 43 gfp_t lflags;
44 44
45 BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_LARGE)); 45 BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL));
46 46
47 if (flags & KM_NOSLEEP) { 47 if (flags & KM_NOSLEEP) {
48 lflags = GFP_ATOMIC | __GFP_NOWARN; 48 lflags = GFP_ATOMIC | __GFP_NOWARN;
@@ -56,10 +56,25 @@ kmem_flags_convert(unsigned int __nocast flags)
56 56
57extern void *kmem_alloc(size_t, unsigned int __nocast); 57extern void *kmem_alloc(size_t, unsigned int __nocast);
58extern void *kmem_zalloc(size_t, unsigned int __nocast); 58extern void *kmem_zalloc(size_t, unsigned int __nocast);
59extern void *kmem_zalloc_greedy(size_t *, size_t, size_t, unsigned int __nocast);
60extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast); 59extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast);
61extern void kmem_free(const void *); 60extern void kmem_free(const void *);
62 61
62static inline void *kmem_zalloc_large(size_t size)
63{
64 void *ptr;
65
66 ptr = vmalloc(size);
67 if (ptr)
68 memset(ptr, 0, size);
69 return ptr;
70}
71static inline void kmem_free_large(void *ptr)
72{
73 vfree(ptr);
74}
75
76extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
77
63/* 78/*
64 * Zone interfaces 79 * Zone interfaces
65 */ 80 */
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index 2512125dfa7c..a7bc925c4d60 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -22,6 +22,7 @@
22#include "xfs_inode.h" 22#include "xfs_inode.h"
23#include "xfs_vnodeops.h" 23#include "xfs_vnodeops.h"
24#include "xfs_trace.h" 24#include "xfs_trace.h"
25#include <linux/slab.h>
25#include <linux/xattr.h> 26#include <linux/xattr.h>
26#include <linux/posix_acl_xattr.h> 27#include <linux/posix_acl_xattr.h>
27 28
@@ -106,7 +107,7 @@ xfs_get_acl(struct inode *inode, int type)
106 struct posix_acl *acl; 107 struct posix_acl *acl;
107 struct xfs_acl *xfs_acl; 108 struct xfs_acl *xfs_acl;
108 int len = sizeof(struct xfs_acl); 109 int len = sizeof(struct xfs_acl);
109 char *ea_name; 110 unsigned char *ea_name;
110 int error; 111 int error;
111 112
112 acl = get_cached_acl(inode, type); 113 acl = get_cached_acl(inode, type);
@@ -133,7 +134,8 @@ xfs_get_acl(struct inode *inode, int type)
133 if (!xfs_acl) 134 if (!xfs_acl)
134 return ERR_PTR(-ENOMEM); 135 return ERR_PTR(-ENOMEM);
135 136
136 error = -xfs_attr_get(ip, ea_name, (char *)xfs_acl, &len, ATTR_ROOT); 137 error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
138 &len, ATTR_ROOT);
137 if (error) { 139 if (error) {
138 /* 140 /*
139 * If the attribute doesn't exist make sure we have a negative 141 * If the attribute doesn't exist make sure we have a negative
@@ -162,7 +164,7 @@ STATIC int
162xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) 164xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
163{ 165{
164 struct xfs_inode *ip = XFS_I(inode); 166 struct xfs_inode *ip = XFS_I(inode);
165 char *ea_name; 167 unsigned char *ea_name;
166 int error; 168 int error;
167 169
168 if (S_ISLNK(inode->i_mode)) 170 if (S_ISLNK(inode->i_mode))
@@ -194,7 +196,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
194 (sizeof(struct xfs_acl_entry) * 196 (sizeof(struct xfs_acl_entry) *
195 (XFS_ACL_MAX_ENTRIES - acl->a_count)); 197 (XFS_ACL_MAX_ENTRIES - acl->a_count));
196 198
197 error = -xfs_attr_set(ip, ea_name, (char *)xfs_acl, 199 error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
198 len, ATTR_ROOT); 200 len, ATTR_ROOT);
199 201
200 kfree(xfs_acl); 202 kfree(xfs_acl);
@@ -251,8 +253,9 @@ xfs_set_mode(struct inode *inode, mode_t mode)
251 if (mode != inode->i_mode) { 253 if (mode != inode->i_mode) {
252 struct iattr iattr; 254 struct iattr iattr;
253 255
254 iattr.ia_valid = ATTR_MODE; 256 iattr.ia_valid = ATTR_MODE | ATTR_CTIME;
255 iattr.ia_mode = mode; 257 iattr.ia_mode = mode;
258 iattr.ia_ctime = current_fs_time(inode->i_sb);
256 259
257 error = -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL); 260 error = -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
258 } 261 }
@@ -261,7 +264,7 @@ xfs_set_mode(struct inode *inode, mode_t mode)
261} 264}
262 265
263static int 266static int
264xfs_acl_exists(struct inode *inode, char *name) 267xfs_acl_exists(struct inode *inode, unsigned char *name)
265{ 268{
266 int len = sizeof(struct xfs_acl); 269 int len = sizeof(struct xfs_acl);
267 270
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 66abe36c1213..0f8b9968a803 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -39,6 +39,8 @@
39#include "xfs_iomap.h" 39#include "xfs_iomap.h"
40#include "xfs_vnodeops.h" 40#include "xfs_vnodeops.h"
41#include "xfs_trace.h" 41#include "xfs_trace.h"
42#include "xfs_bmap.h"
43#include <linux/gfp.h>
42#include <linux/mpage.h> 44#include <linux/mpage.h>
43#include <linux/pagevec.h> 45#include <linux/pagevec.h>
44#include <linux/writeback.h> 46#include <linux/writeback.h>
@@ -163,14 +165,17 @@ xfs_ioend_new_eof(
163} 165}
164 166
165/* 167/*
166 * Update on-disk file size now that data has been written to disk. 168 * Update on-disk file size now that data has been written to disk. The
167 * The current in-memory file size is i_size. If a write is beyond 169 * current in-memory file size is i_size. If a write is beyond eof i_new_size
168 * eof i_new_size will be the intended file size until i_size is 170 * will be the intended file size until i_size is updated. If this write does
169 * updated. If this write does not extend all the way to the valid 171 * not extend all the way to the valid file size then restrict this update to
170 * file size then restrict this update to the end of the write. 172 * the end of the write.
173 *
174 * This function does not block as blocking on the inode lock in IO completion
175 * can lead to IO completion order dependency deadlocks.. If it can't get the
176 * inode ilock it will return EAGAIN. Callers must handle this.
171 */ 177 */
172 178STATIC int
173STATIC void
174xfs_setfilesize( 179xfs_setfilesize(
175 xfs_ioend_t *ioend) 180 xfs_ioend_t *ioend)
176{ 181{
@@ -181,16 +186,40 @@ xfs_setfilesize(
181 ASSERT(ioend->io_type != IOMAP_READ); 186 ASSERT(ioend->io_type != IOMAP_READ);
182 187
183 if (unlikely(ioend->io_error)) 188 if (unlikely(ioend->io_error))
184 return; 189 return 0;
190
191 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
192 return EAGAIN;
185 193
186 xfs_ilock(ip, XFS_ILOCK_EXCL);
187 isize = xfs_ioend_new_eof(ioend); 194 isize = xfs_ioend_new_eof(ioend);
188 if (isize) { 195 if (isize) {
189 ip->i_d.di_size = isize; 196 ip->i_d.di_size = isize;
190 xfs_mark_inode_dirty_sync(ip); 197 xfs_mark_inode_dirty(ip);
191 } 198 }
192 199
193 xfs_iunlock(ip, XFS_ILOCK_EXCL); 200 xfs_iunlock(ip, XFS_ILOCK_EXCL);
201 return 0;
202}
203
204/*
205 * Schedule IO completion handling on a xfsdatad if this was
206 * the final hold on this ioend. If we are asked to wait,
207 * flush the workqueue.
208 */
209STATIC void
210xfs_finish_ioend(
211 xfs_ioend_t *ioend,
212 int wait)
213{
214 if (atomic_dec_and_test(&ioend->io_remaining)) {
215 struct workqueue_struct *wq;
216
217 wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
218 xfsconvertd_workqueue : xfsdatad_workqueue;
219 queue_work(wq, &ioend->io_work);
220 if (wait)
221 flush_workqueue(wq);
222 }
194} 223}
195 224
196/* 225/*
@@ -198,11 +227,11 @@ xfs_setfilesize(
198 */ 227 */
199STATIC void 228STATIC void
200xfs_end_io( 229xfs_end_io(
201 struct work_struct *work) 230 struct work_struct *work)
202{ 231{
203 xfs_ioend_t *ioend = 232 xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work);
204 container_of(work, xfs_ioend_t, io_work); 233 struct xfs_inode *ip = XFS_I(ioend->io_inode);
205 struct xfs_inode *ip = XFS_I(ioend->io_inode); 234 int error = 0;
206 235
207 /* 236 /*
208 * For unwritten extents we need to issue transactions to convert a 237 * For unwritten extents we need to issue transactions to convert a
@@ -210,7 +239,6 @@ xfs_end_io(
210 */ 239 */
211 if (ioend->io_type == IOMAP_UNWRITTEN && 240 if (ioend->io_type == IOMAP_UNWRITTEN &&
212 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { 241 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
213 int error;
214 242
215 error = xfs_iomap_write_unwritten(ip, ioend->io_offset, 243 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
216 ioend->io_size); 244 ioend->io_size);
@@ -222,30 +250,23 @@ xfs_end_io(
222 * We might have to update the on-disk file size after extending 250 * We might have to update the on-disk file size after extending
223 * writes. 251 * writes.
224 */ 252 */
225 if (ioend->io_type != IOMAP_READ) 253 if (ioend->io_type != IOMAP_READ) {
226 xfs_setfilesize(ioend); 254 error = xfs_setfilesize(ioend);
227 xfs_destroy_ioend(ioend); 255 ASSERT(!error || error == EAGAIN);
228}
229
230/*
231 * Schedule IO completion handling on a xfsdatad if this was
232 * the final hold on this ioend. If we are asked to wait,
233 * flush the workqueue.
234 */
235STATIC void
236xfs_finish_ioend(
237 xfs_ioend_t *ioend,
238 int wait)
239{
240 if (atomic_dec_and_test(&ioend->io_remaining)) {
241 struct workqueue_struct *wq;
242
243 wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
244 xfsconvertd_workqueue : xfsdatad_workqueue;
245 queue_work(wq, &ioend->io_work);
246 if (wait)
247 flush_workqueue(wq);
248 } 256 }
257
258 /*
259 * If we didn't complete processing of the ioend, requeue it to the
260 * tail of the workqueue for another attempt later. Otherwise destroy
261 * it.
262 */
263 if (error == EAGAIN) {
264 atomic_inc(&ioend->io_remaining);
265 xfs_finish_ioend(ioend, 0);
266 /* ensure we don't spin on blocked ioends */
267 delay(1);
268 } else
269 xfs_destroy_ioend(ioend);
249} 270}
250 271
251/* 272/*
@@ -341,7 +362,7 @@ xfs_submit_ioend_bio(
341 * but don't update the inode size until I/O completion. 362 * but don't update the inode size until I/O completion.
342 */ 363 */
343 if (xfs_ioend_new_eof(ioend)) 364 if (xfs_ioend_new_eof(ioend))
344 xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode)); 365 xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
345 366
346 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 367 submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
347 WRITE_SYNC_PLUG : WRITE, bio); 368 WRITE_SYNC_PLUG : WRITE, bio);
@@ -874,6 +895,125 @@ xfs_cluster_write(
874 } 895 }
875} 896}
876 897
898STATIC void
899xfs_vm_invalidatepage(
900 struct page *page,
901 unsigned long offset)
902{
903 trace_xfs_invalidatepage(page->mapping->host, page, offset);
904 block_invalidatepage(page, offset);
905}
906
907/*
908 * If the page has delalloc buffers on it, we need to punch them out before we
909 * invalidate the page. If we don't, we leave a stale delalloc mapping on the
910 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
911 * is done on that same region - the delalloc extent is returned when none is
912 * supposed to be there.
913 *
914 * We prevent this by truncating away the delalloc regions on the page before
915 * invalidating it. Because they are delalloc, we can do this without needing a
916 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
917 * truncation without a transaction as there is no space left for block
918 * reservation (typically why we see a ENOSPC in writeback).
919 *
920 * This is not a performance critical path, so for now just do the punching a
921 * buffer head at a time.
922 */
923STATIC void
924xfs_aops_discard_page(
925 struct page *page)
926{
927 struct inode *inode = page->mapping->host;
928 struct xfs_inode *ip = XFS_I(inode);
929 struct buffer_head *bh, *head;
930 loff_t offset = page_offset(page);
931 ssize_t len = 1 << inode->i_blkbits;
932
933 if (!xfs_is_delayed_page(page, IOMAP_DELAY))
934 goto out_invalidate;
935
936 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
937 goto out_invalidate;
938
939 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
940 "page discard on page %p, inode 0x%llx, offset %llu.",
941 page, ip->i_ino, offset);
942
943 xfs_ilock(ip, XFS_ILOCK_EXCL);
944 bh = head = page_buffers(page);
945 do {
946 int done;
947 xfs_fileoff_t offset_fsb;
948 xfs_bmbt_irec_t imap;
949 int nimaps = 1;
950 int error;
951 xfs_fsblock_t firstblock;
952 xfs_bmap_free_t flist;
953
954 if (!buffer_delay(bh))
955 goto next_buffer;
956
957 offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
958
959 /*
960 * Map the range first and check that it is a delalloc extent
961 * before trying to unmap the range. Otherwise we will be
962 * trying to remove a real extent (which requires a
963 * transaction) or a hole, which is probably a bad idea...
964 */
965 error = xfs_bmapi(NULL, ip, offset_fsb, 1,
966 XFS_BMAPI_ENTIRE, NULL, 0, &imap,
967 &nimaps, NULL, NULL);
968
969 if (error) {
970 /* something screwed, just bail */
971 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
972 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
973 "page discard failed delalloc mapping lookup.");
974 }
975 break;
976 }
977 if (!nimaps) {
978 /* nothing there */
979 goto next_buffer;
980 }
981 if (imap.br_startblock != DELAYSTARTBLOCK) {
982 /* been converted, ignore */
983 goto next_buffer;
984 }
985 WARN_ON(imap.br_blockcount == 0);
986
987 /*
988 * Note: while we initialise the firstblock/flist pair, they
989 * should never be used because blocks should never be
990 * allocated or freed for a delalloc extent and hence we need
991 * don't cancel or finish them after the xfs_bunmapi() call.
992 */
993 xfs_bmap_init(&flist, &firstblock);
994 error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
995 &flist, NULL, &done);
996
997 ASSERT(!flist.xbf_count && !flist.xbf_first);
998 if (error) {
999 /* something screwed, just bail */
1000 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1001 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
1002 "page discard unable to remove delalloc mapping.");
1003 }
1004 break;
1005 }
1006next_buffer:
1007 offset += len;
1008
1009 } while ((bh = bh->b_this_page) != head);
1010
1011 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1012out_invalidate:
1013 xfs_vm_invalidatepage(page, 0);
1014 return;
1015}
1016
877/* 1017/*
878 * Calling this without startio set means we are being asked to make a dirty 1018 * Calling this without startio set means we are being asked to make a dirty
879 * page ready for freeing it's buffers. When called with startio set then 1019 * page ready for freeing it's buffers. When called with startio set then
@@ -1125,7 +1265,7 @@ error:
1125 */ 1265 */
1126 if (err != -EAGAIN) { 1266 if (err != -EAGAIN) {
1127 if (!unmapped) 1267 if (!unmapped)
1128 block_invalidatepage(page, 0); 1268 xfs_aops_discard_page(page);
1129 ClearPageUptodate(page); 1269 ClearPageUptodate(page);
1130 } 1270 }
1131 return err; 1271 return err;
@@ -1535,15 +1675,6 @@ xfs_vm_readpages(
1535 return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); 1675 return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
1536} 1676}
1537 1677
1538STATIC void
1539xfs_vm_invalidatepage(
1540 struct page *page,
1541 unsigned long offset)
1542{
1543 trace_xfs_invalidatepage(page->mapping->host, page, offset);
1544 block_invalidatepage(page, offset);
1545}
1546
1547const struct address_space_operations xfs_address_space_operations = { 1678const struct address_space_operations xfs_address_space_operations = {
1548 .readpage = xfs_vm_readpage, 1679 .readpage = xfs_vm_readpage,
1549 .readpages = xfs_vm_readpages, 1680 .readpages = xfs_vm_readpages,
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 77b8be81c769..44c2b0ef9a41 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -18,7 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include <linux/stddef.h> 19#include <linux/stddef.h>
20#include <linux/errno.h> 20#include <linux/errno.h>
21#include <linux/slab.h> 21#include <linux/gfp.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
@@ -33,6 +33,7 @@
33#include <linux/migrate.h> 33#include <linux/migrate.h>
34#include <linux/backing-dev.h> 34#include <linux/backing-dev.h>
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36#include <linux/list_sort.h>
36 37
37#include "xfs_sb.h" 38#include "xfs_sb.h"
38#include "xfs_inum.h" 39#include "xfs_inum.h"
@@ -76,6 +77,27 @@ struct workqueue_struct *xfsconvertd_workqueue;
76#define xfs_buf_deallocate(bp) \ 77#define xfs_buf_deallocate(bp) \
77 kmem_zone_free(xfs_buf_zone, (bp)); 78 kmem_zone_free(xfs_buf_zone, (bp));
78 79
80static inline int
81xfs_buf_is_vmapped(
82 struct xfs_buf *bp)
83{
84 /*
85 * Return true if the buffer is vmapped.
86 *
87 * The XBF_MAPPED flag is set if the buffer should be mapped, but the
88 * code is clever enough to know it doesn't have to map a single page,
89 * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1.
90 */
91 return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1;
92}
93
94static inline int
95xfs_buf_vmap_len(
96 struct xfs_buf *bp)
97{
98 return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
99}
100
79/* 101/*
80 * Page Region interfaces. 102 * Page Region interfaces.
81 * 103 *
@@ -146,75 +168,6 @@ test_page_region(
146} 168}
147 169
148/* 170/*
149 * Mapping of multi-page buffers into contiguous virtual space
150 */
151
152typedef struct a_list {
153 void *vm_addr;
154 struct a_list *next;
155} a_list_t;
156
157static a_list_t *as_free_head;
158static int as_list_len;
159static DEFINE_SPINLOCK(as_lock);
160
161/*
162 * Try to batch vunmaps because they are costly.
163 */
164STATIC void
165free_address(
166 void *addr)
167{
168 a_list_t *aentry;
169
170#ifdef CONFIG_XEN
171 /*
172 * Xen needs to be able to make sure it can get an exclusive
173 * RO mapping of pages it wants to turn into a pagetable. If
174 * a newly allocated page is also still being vmap()ed by xfs,
175 * it will cause pagetable construction to fail. This is a
176 * quick workaround to always eagerly unmap pages so that Xen
177 * is happy.
178 */
179 vunmap(addr);
180 return;
181#endif
182
183 aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
184 if (likely(aentry)) {
185 spin_lock(&as_lock);
186 aentry->next = as_free_head;
187 aentry->vm_addr = addr;
188 as_free_head = aentry;
189 as_list_len++;
190 spin_unlock(&as_lock);
191 } else {
192 vunmap(addr);
193 }
194}
195
196STATIC void
197purge_addresses(void)
198{
199 a_list_t *aentry, *old;
200
201 if (as_free_head == NULL)
202 return;
203
204 spin_lock(&as_lock);
205 aentry = as_free_head;
206 as_free_head = NULL;
207 as_list_len = 0;
208 spin_unlock(&as_lock);
209
210 while ((old = aentry) != NULL) {
211 vunmap(aentry->vm_addr);
212 aentry = aentry->next;
213 kfree(old);
214 }
215}
216
217/*
218 * Internal xfs_buf_t object manipulation 171 * Internal xfs_buf_t object manipulation
219 */ 172 */
220 173
@@ -314,8 +267,9 @@ xfs_buf_free(
314 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 267 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
315 uint i; 268 uint i;
316 269
317 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) 270 if (xfs_buf_is_vmapped(bp))
318 free_address(bp->b_addr - bp->b_offset); 271 vm_unmap_ram(bp->b_addr - bp->b_offset,
272 bp->b_page_count);
319 273
320 for (i = 0; i < bp->b_page_count; i++) { 274 for (i = 0; i < bp->b_page_count; i++) {
321 struct page *page = bp->b_pages[i]; 275 struct page *page = bp->b_pages[i];
@@ -435,10 +389,8 @@ _xfs_buf_map_pages(
435 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 389 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
436 bp->b_flags |= XBF_MAPPED; 390 bp->b_flags |= XBF_MAPPED;
437 } else if (flags & XBF_MAPPED) { 391 } else if (flags & XBF_MAPPED) {
438 if (as_list_len > 64) 392 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
439 purge_addresses(); 393 -1, PAGE_KERNEL);
440 bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
441 VM_MAP, PAGE_KERNEL);
442 if (unlikely(bp->b_addr == NULL)) 394 if (unlikely(bp->b_addr == NULL))
443 return -ENOMEM; 395 return -ENOMEM;
444 bp->b_addr += bp->b_offset; 396 bp->b_addr += bp->b_offset;
@@ -1051,22 +1003,30 @@ xfs_buf_ioerror(
1051} 1003}
1052 1004
1053int 1005int
1054xfs_bawrite( 1006xfs_bwrite(
1055 void *mp, 1007 struct xfs_mount *mp,
1056 struct xfs_buf *bp) 1008 struct xfs_buf *bp)
1057{ 1009{
1058 trace_xfs_buf_bawrite(bp, _RET_IP_); 1010 int iowait = (bp->b_flags & XBF_ASYNC) == 0;
1011 int error = 0;
1059 1012
1060 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); 1013 bp->b_strat = xfs_bdstrat_cb;
1014 bp->b_mount = mp;
1015 bp->b_flags |= XBF_WRITE;
1016 if (!iowait)
1017 bp->b_flags |= _XBF_RUN_QUEUES;
1061 1018
1062 xfs_buf_delwri_dequeue(bp); 1019 xfs_buf_delwri_dequeue(bp);
1020 xfs_buf_iostrategy(bp);
1063 1021
1064 bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD); 1022 if (iowait) {
1065 bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES); 1023 error = xfs_buf_iowait(bp);
1024 if (error)
1025 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1026 xfs_buf_relse(bp);
1027 }
1066 1028
1067 bp->b_mount = mp; 1029 return error;
1068 bp->b_strat = xfs_bdstrat_cb;
1069 return xfs_bdstrat_cb(bp);
1070} 1030}
1071 1031
1072void 1032void
@@ -1085,6 +1045,126 @@ xfs_bdwrite(
1085 xfs_buf_delwri_queue(bp, 1); 1045 xfs_buf_delwri_queue(bp, 1);
1086} 1046}
1087 1047
1048/*
1049 * Called when we want to stop a buffer from getting written or read.
1050 * We attach the EIO error, muck with its flags, and call biodone
1051 * so that the proper iodone callbacks get called.
1052 */
1053STATIC int
1054xfs_bioerror(
1055 xfs_buf_t *bp)
1056{
1057#ifdef XFSERRORDEBUG
1058 ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
1059#endif
1060
1061 /*
1062 * No need to wait until the buffer is unpinned, we aren't flushing it.
1063 */
1064 XFS_BUF_ERROR(bp, EIO);
1065
1066 /*
1067 * We're calling biodone, so delete XBF_DONE flag.
1068 */
1069 XFS_BUF_UNREAD(bp);
1070 XFS_BUF_UNDELAYWRITE(bp);
1071 XFS_BUF_UNDONE(bp);
1072 XFS_BUF_STALE(bp);
1073
1074 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
1075 xfs_biodone(bp);
1076
1077 return EIO;
1078}
1079
1080/*
1081 * Same as xfs_bioerror, except that we are releasing the buffer
1082 * here ourselves, and avoiding the biodone call.
1083 * This is meant for userdata errors; metadata bufs come with
1084 * iodone functions attached, so that we can track down errors.
1085 */
1086STATIC int
1087xfs_bioerror_relse(
1088 struct xfs_buf *bp)
1089{
1090 int64_t fl = XFS_BUF_BFLAGS(bp);
1091 /*
1092 * No need to wait until the buffer is unpinned.
1093 * We aren't flushing it.
1094 *
1095 * chunkhold expects B_DONE to be set, whether
1096 * we actually finish the I/O or not. We don't want to
1097 * change that interface.
1098 */
1099 XFS_BUF_UNREAD(bp);
1100 XFS_BUF_UNDELAYWRITE(bp);
1101 XFS_BUF_DONE(bp);
1102 XFS_BUF_STALE(bp);
1103 XFS_BUF_CLR_IODONE_FUNC(bp);
1104 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
1105 if (!(fl & XBF_ASYNC)) {
1106 /*
1107 * Mark b_error and B_ERROR _both_.
1108 * Lot's of chunkcache code assumes that.
1109 * There's no reason to mark error for
1110 * ASYNC buffers.
1111 */
1112 XFS_BUF_ERROR(bp, EIO);
1113 XFS_BUF_FINISH_IOWAIT(bp);
1114 } else {
1115 xfs_buf_relse(bp);
1116 }
1117
1118 return EIO;
1119}
1120
1121
1122/*
1123 * All xfs metadata buffers except log state machine buffers
1124 * get this attached as their b_bdstrat callback function.
1125 * This is so that we can catch a buffer
1126 * after prematurely unpinning it to forcibly shutdown the filesystem.
1127 */
1128int
1129xfs_bdstrat_cb(
1130 struct xfs_buf *bp)
1131{
1132 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
1133 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1134 /*
1135 * Metadata write that didn't get logged but
1136 * written delayed anyway. These aren't associated
1137 * with a transaction, and can be ignored.
1138 */
1139 if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
1140 return xfs_bioerror_relse(bp);
1141 else
1142 return xfs_bioerror(bp);
1143 }
1144
1145 xfs_buf_iorequest(bp);
1146 return 0;
1147}
1148
1149/*
1150 * Wrapper around bdstrat so that we can stop data from going to disk in case
1151 * we are shutting down the filesystem. Typically user data goes thru this
1152 * path; one of the exceptions is the superblock.
1153 */
1154void
1155xfsbdstrat(
1156 struct xfs_mount *mp,
1157 struct xfs_buf *bp)
1158{
1159 if (XFS_FORCED_SHUTDOWN(mp)) {
1160 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1161 xfs_bioerror_relse(bp);
1162 return;
1163 }
1164
1165 xfs_buf_iorequest(bp);
1166}
1167
1088STATIC void 1168STATIC void
1089_xfs_buf_ioend( 1169_xfs_buf_ioend(
1090 xfs_buf_t *bp, 1170 xfs_buf_t *bp,
@@ -1107,6 +1187,9 @@ xfs_buf_bio_end_io(
1107 1187
1108 xfs_buf_ioerror(bp, -error); 1188 xfs_buf_ioerror(bp, -error);
1109 1189
1190 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1191 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1192
1110 do { 1193 do {
1111 struct page *page = bvec->bv_page; 1194 struct page *page = bvec->bv_page;
1112 1195
@@ -1216,6 +1299,10 @@ next_chunk:
1216 1299
1217submit_io: 1300submit_io:
1218 if (likely(bio->bi_size)) { 1301 if (likely(bio->bi_size)) {
1302 if (xfs_buf_is_vmapped(bp)) {
1303 flush_kernel_vmap_range(bp->b_addr,
1304 xfs_buf_vmap_len(bp));
1305 }
1219 submit_bio(rw, bio); 1306 submit_bio(rw, bio);
1220 if (size) 1307 if (size)
1221 goto next_chunk; 1308 goto next_chunk;
@@ -1296,7 +1383,7 @@ xfs_buf_iomove(
1296 xfs_buf_t *bp, /* buffer to process */ 1383 xfs_buf_t *bp, /* buffer to process */
1297 size_t boff, /* starting buffer offset */ 1384 size_t boff, /* starting buffer offset */
1298 size_t bsize, /* length to copy */ 1385 size_t bsize, /* length to copy */
1299 caddr_t data, /* data address */ 1386 void *data, /* data address */
1300 xfs_buf_rw_t mode) /* read/write/zero flag */ 1387 xfs_buf_rw_t mode) /* read/write/zero flag */
1301{ 1388{
1302 size_t bend, cpoff, csize; 1389 size_t bend, cpoff, csize;
@@ -1378,8 +1465,8 @@ xfs_alloc_bufhash(
1378 1465
1379 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ 1466 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */
1380 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; 1467 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
1381 btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) * 1468 btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
1382 sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE); 1469 sizeof(xfs_bufhash_t));
1383 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1470 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
1384 spin_lock_init(&btp->bt_hash[i].bh_lock); 1471 spin_lock_init(&btp->bt_hash[i].bh_lock);
1385 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); 1472 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
@@ -1390,7 +1477,7 @@ STATIC void
1390xfs_free_bufhash( 1477xfs_free_bufhash(
1391 xfs_buftarg_t *btp) 1478 xfs_buftarg_t *btp)
1392{ 1479{
1393 kmem_free(btp->bt_hash); 1480 kmem_free_large(btp->bt_hash);
1394 btp->bt_hash = NULL; 1481 btp->bt_hash = NULL;
1395} 1482}
1396 1483
@@ -1595,6 +1682,11 @@ xfs_buf_delwri_queue(
1595 list_del(&bp->b_list); 1682 list_del(&bp->b_list);
1596 } 1683 }
1597 1684
1685 if (list_empty(dwq)) {
1686 /* start xfsbufd as it is about to have something to do */
1687 wake_up_process(bp->b_target->bt_task);
1688 }
1689
1598 bp->b_flags |= _XBF_DELWRI_Q; 1690 bp->b_flags |= _XBF_DELWRI_Q;
1599 list_add_tail(&bp->b_list, dwq); 1691 list_add_tail(&bp->b_list, dwq);
1600 bp->b_queuetime = jiffies; 1692 bp->b_queuetime = jiffies;
@@ -1626,6 +1718,35 @@ xfs_buf_delwri_dequeue(
1626 trace_xfs_buf_delwri_dequeue(bp, _RET_IP_); 1718 trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
1627} 1719}
1628 1720
1721/*
1722 * If a delwri buffer needs to be pushed before it has aged out, then promote
1723 * it to the head of the delwri queue so that it will be flushed on the next
1724 * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
1725 * than the age currently needed to flush the buffer. Hence the next time the
1726 * xfsbufd sees it is guaranteed to be considered old enough to flush.
1727 */
1728void
1729xfs_buf_delwri_promote(
1730 struct xfs_buf *bp)
1731{
1732 struct xfs_buftarg *btp = bp->b_target;
1733 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
1734
1735 ASSERT(bp->b_flags & XBF_DELWRI);
1736 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1737
1738 /*
1739 * Check the buffer age before locking the delayed write queue as we
1740 * don't need to promote buffers that are already past the flush age.
1741 */
1742 if (bp->b_queuetime < jiffies - age)
1743 return;
1744 bp->b_queuetime = jiffies - age;
1745 spin_lock(&btp->bt_delwrite_lock);
1746 list_move(&bp->b_list, &btp->bt_delwrite_queue);
1747 spin_unlock(&btp->bt_delwrite_lock);
1748}
1749
1629STATIC void 1750STATIC void
1630xfs_buf_runall_queues( 1751xfs_buf_runall_queues(
1631 struct workqueue_struct *queue) 1752 struct workqueue_struct *queue)
@@ -1644,6 +1765,8 @@ xfsbufd_wakeup(
1644 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) { 1765 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
1645 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags)) 1766 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
1646 continue; 1767 continue;
1768 if (list_empty(&btp->bt_delwrite_queue))
1769 continue;
1647 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags); 1770 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
1648 wake_up_process(btp->bt_task); 1771 wake_up_process(btp->bt_task);
1649 } 1772 }
@@ -1694,20 +1817,53 @@ xfs_buf_delwri_split(
1694 1817
1695} 1818}
1696 1819
1820/*
1821 * Compare function is more complex than it needs to be because
1822 * the return value is only 32 bits and we are doing comparisons
1823 * on 64 bit values
1824 */
1825static int
1826xfs_buf_cmp(
1827 void *priv,
1828 struct list_head *a,
1829 struct list_head *b)
1830{
1831 struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list);
1832 struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
1833 xfs_daddr_t diff;
1834
1835 diff = ap->b_bn - bp->b_bn;
1836 if (diff < 0)
1837 return -1;
1838 if (diff > 0)
1839 return 1;
1840 return 0;
1841}
1842
1843void
1844xfs_buf_delwri_sort(
1845 xfs_buftarg_t *target,
1846 struct list_head *list)
1847{
1848 list_sort(NULL, list, xfs_buf_cmp);
1849}
1850
1697STATIC int 1851STATIC int
1698xfsbufd( 1852xfsbufd(
1699 void *data) 1853 void *data)
1700{ 1854{
1701 struct list_head tmp; 1855 xfs_buftarg_t *target = (xfs_buftarg_t *)data;
1702 xfs_buftarg_t *target = (xfs_buftarg_t *)data;
1703 int count;
1704 xfs_buf_t *bp;
1705 1856
1706 current->flags |= PF_MEMALLOC; 1857 current->flags |= PF_MEMALLOC;
1707 1858
1708 set_freezable(); 1859 set_freezable();
1709 1860
1710 do { 1861 do {
1862 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
1863 long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
1864 int count = 0;
1865 struct list_head tmp;
1866
1711 if (unlikely(freezing(current))) { 1867 if (unlikely(freezing(current))) {
1712 set_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1868 set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
1713 refrigerator(); 1869 refrigerator();
@@ -1715,24 +1871,20 @@ xfsbufd(
1715 clear_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1871 clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
1716 } 1872 }
1717 1873
1718 schedule_timeout_interruptible( 1874 /* sleep for a long time if there is nothing to do. */
1719 xfs_buf_timer_centisecs * msecs_to_jiffies(10)); 1875 if (list_empty(&target->bt_delwrite_queue))
1876 tout = MAX_SCHEDULE_TIMEOUT;
1877 schedule_timeout_interruptible(tout);
1720 1878
1721 xfs_buf_delwri_split(target, &tmp, 1879 xfs_buf_delwri_split(target, &tmp, age);
1722 xfs_buf_age_centisecs * msecs_to_jiffies(10)); 1880 list_sort(NULL, &tmp, xfs_buf_cmp);
1723
1724 count = 0;
1725 while (!list_empty(&tmp)) { 1881 while (!list_empty(&tmp)) {
1726 bp = list_entry(tmp.next, xfs_buf_t, b_list); 1882 struct xfs_buf *bp;
1727 ASSERT(target == bp->b_target); 1883 bp = list_first_entry(&tmp, struct xfs_buf, b_list);
1728
1729 list_del_init(&bp->b_list); 1884 list_del_init(&bp->b_list);
1730 xfs_buf_iostrategy(bp); 1885 xfs_buf_iostrategy(bp);
1731 count++; 1886 count++;
1732 } 1887 }
1733
1734 if (as_list_len > 0)
1735 purge_addresses();
1736 if (count) 1888 if (count)
1737 blk_run_address_space(target->bt_mapping); 1889 blk_run_address_space(target->bt_mapping);
1738 1890
@@ -1751,42 +1903,45 @@ xfs_flush_buftarg(
1751 xfs_buftarg_t *target, 1903 xfs_buftarg_t *target,
1752 int wait) 1904 int wait)
1753{ 1905{
1754 struct list_head tmp; 1906 xfs_buf_t *bp;
1755 xfs_buf_t *bp, *n;
1756 int pincount = 0; 1907 int pincount = 0;
1908 LIST_HEAD(tmp_list);
1909 LIST_HEAD(wait_list);
1757 1910
1758 xfs_buf_runall_queues(xfsconvertd_workqueue); 1911 xfs_buf_runall_queues(xfsconvertd_workqueue);
1759 xfs_buf_runall_queues(xfsdatad_workqueue); 1912 xfs_buf_runall_queues(xfsdatad_workqueue);
1760 xfs_buf_runall_queues(xfslogd_workqueue); 1913 xfs_buf_runall_queues(xfslogd_workqueue);
1761 1914
1762 set_bit(XBT_FORCE_FLUSH, &target->bt_flags); 1915 set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
1763 pincount = xfs_buf_delwri_split(target, &tmp, 0); 1916 pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
1764 1917
1765 /* 1918 /*
1766 * Dropped the delayed write list lock, now walk the temporary list 1919 * Dropped the delayed write list lock, now walk the temporary list.
1920 * All I/O is issued async and then if we need to wait for completion
1921 * we do that after issuing all the IO.
1767 */ 1922 */
1768 list_for_each_entry_safe(bp, n, &tmp, b_list) { 1923 list_sort(NULL, &tmp_list, xfs_buf_cmp);
1924 while (!list_empty(&tmp_list)) {
1925 bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
1769 ASSERT(target == bp->b_target); 1926 ASSERT(target == bp->b_target);
1770 if (wait) 1927 list_del_init(&bp->b_list);
1928 if (wait) {
1771 bp->b_flags &= ~XBF_ASYNC; 1929 bp->b_flags &= ~XBF_ASYNC;
1772 else 1930 list_add(&bp->b_list, &wait_list);
1773 list_del_init(&bp->b_list); 1931 }
1774
1775 xfs_buf_iostrategy(bp); 1932 xfs_buf_iostrategy(bp);
1776 } 1933 }
1777 1934
1778 if (wait) 1935 if (wait) {
1936 /* Expedite and wait for IO to complete. */
1779 blk_run_address_space(target->bt_mapping); 1937 blk_run_address_space(target->bt_mapping);
1938 while (!list_empty(&wait_list)) {
1939 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
1780 1940
1781 /* 1941 list_del_init(&bp->b_list);
1782 * Remaining list items must be flushed before returning 1942 xfs_iowait(bp);
1783 */ 1943 xfs_buf_relse(bp);
1784 while (!list_empty(&tmp)) { 1944 }
1785 bp = list_entry(tmp.next, xfs_buf_t, b_list);
1786
1787 list_del_init(&bp->b_list);
1788 xfs_iowait(bp);
1789 xfs_buf_relse(bp);
1790 } 1945 }
1791 1946
1792 return pincount; 1947 return pincount;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a34c7b54822d..386e7361e50e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -232,13 +232,17 @@ extern void xfs_buf_lock(xfs_buf_t *);
232extern void xfs_buf_unlock(xfs_buf_t *); 232extern void xfs_buf_unlock(xfs_buf_t *);
233 233
234/* Buffer Read and Write Routines */ 234/* Buffer Read and Write Routines */
235extern int xfs_bawrite(void *mp, xfs_buf_t *bp); 235extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
236extern void xfs_bdwrite(void *mp, xfs_buf_t *bp); 236extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
237
238extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
239extern int xfs_bdstrat_cb(struct xfs_buf *);
240
237extern void xfs_buf_ioend(xfs_buf_t *, int); 241extern void xfs_buf_ioend(xfs_buf_t *, int);
238extern void xfs_buf_ioerror(xfs_buf_t *, int); 242extern void xfs_buf_ioerror(xfs_buf_t *, int);
239extern int xfs_buf_iorequest(xfs_buf_t *); 243extern int xfs_buf_iorequest(xfs_buf_t *);
240extern int xfs_buf_iowait(xfs_buf_t *); 244extern int xfs_buf_iowait(xfs_buf_t *);
241extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t, 245extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
242 xfs_buf_rw_t); 246 xfs_buf_rw_t);
243 247
244static inline int xfs_buf_iostrategy(xfs_buf_t *bp) 248static inline int xfs_buf_iostrategy(xfs_buf_t *bp)
@@ -261,6 +265,7 @@ extern int xfs_buf_ispin(xfs_buf_t *);
261 265
262/* Delayed Write Buffer Routines */ 266/* Delayed Write Buffer Routines */
263extern void xfs_buf_delwri_dequeue(xfs_buf_t *); 267extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
268extern void xfs_buf_delwri_promote(xfs_buf_t *);
264 269
265/* Buffer Daemon Setup Routines */ 270/* Buffer Daemon Setup Routines */
266extern int xfs_buf_init(void); 271extern int xfs_buf_init(void);
@@ -270,33 +275,19 @@ extern void xfs_buf_terminate(void);
270 ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; }) 275 ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; })
271 276
272 277
273#define XFS_B_ASYNC XBF_ASYNC
274#define XFS_B_DELWRI XBF_DELWRI
275#define XFS_B_READ XBF_READ
276#define XFS_B_WRITE XBF_WRITE
277#define XFS_B_STALE XBF_STALE
278
279#define XFS_BUF_TRYLOCK XBF_TRYLOCK
280#define XFS_INCORE_TRYLOCK XBF_TRYLOCK
281#define XFS_BUF_LOCK XBF_LOCK
282#define XFS_BUF_MAPPED XBF_MAPPED
283
284#define BUF_BUSY XBF_DONT_BLOCK
285
286#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) 278#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags)
287#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ 279#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
288 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) 280 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
289 281
290#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XFS_B_STALE) 282#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE)
291#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XFS_B_STALE) 283#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
292#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XFS_B_STALE) 284#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
293#define XFS_BUF_SUPER_STALE(bp) do { \ 285#define XFS_BUF_SUPER_STALE(bp) do { \
294 XFS_BUF_STALE(bp); \ 286 XFS_BUF_STALE(bp); \
295 xfs_buf_delwri_dequeue(bp); \ 287 xfs_buf_delwri_dequeue(bp); \
296 XFS_BUF_DONE(bp); \ 288 XFS_BUF_DONE(bp); \
297 } while (0) 289 } while (0)
298 290
299#define XFS_BUF_MANAGE XBF_FS_MANAGED
300#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED) 291#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED)
301 292
302#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) 293#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI)
@@ -385,31 +376,11 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
385 376
386#define xfs_biomove(bp, off, len, data, rw) \ 377#define xfs_biomove(bp, off, len, data, rw) \
387 xfs_buf_iomove((bp), (off), (len), (data), \ 378 xfs_buf_iomove((bp), (off), (len), (data), \
388 ((rw) == XFS_B_WRITE) ? XBRW_WRITE : XBRW_READ) 379 ((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ)
389 380
390#define xfs_biozero(bp, off, len) \ 381#define xfs_biozero(bp, off, len) \
391 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) 382 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
392 383
393
394static inline int XFS_bwrite(xfs_buf_t *bp)
395{
396 int iowait = (bp->b_flags & XBF_ASYNC) == 0;
397 int error = 0;
398
399 if (!iowait)
400 bp->b_flags |= _XBF_RUN_QUEUES;
401
402 xfs_buf_delwri_dequeue(bp);
403 xfs_buf_iostrategy(bp);
404 if (iowait) {
405 error = xfs_buf_iowait(bp);
406 xfs_buf_relse(bp);
407 }
408 return error;
409}
410
411#define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
412
413#define xfs_iowait(bp) xfs_buf_iowait(bp) 384#define xfs_iowait(bp) xfs_buf_iowait(bp)
414 385
415#define xfs_baread(target, rablkno, ralen) \ 386#define xfs_baread(target, rablkno, ralen) \
@@ -424,6 +395,7 @@ extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
424extern void xfs_wait_buftarg(xfs_buftarg_t *); 395extern void xfs_wait_buftarg(xfs_buftarg_t *);
425extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 396extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
426extern int xfs_flush_buftarg(xfs_buftarg_t *, int); 397extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
398
427#ifdef CONFIG_KDB_MODULES 399#ifdef CONFIG_KDB_MODULES
428extern struct list_head *xfs_get_buftarg_list(void); 400extern struct list_head *xfs_get_buftarg_list(void);
429#endif 401#endif
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 87b8cbd23d4b..846b75aeb2ab 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -29,6 +29,7 @@
29#include "xfs_vnodeops.h" 29#include "xfs_vnodeops.h"
30#include "xfs_bmap_btree.h" 30#include "xfs_bmap_btree.h"
31#include "xfs_inode.h" 31#include "xfs_inode.h"
32#include "xfs_inode_item.h"
32 33
33/* 34/*
34 * Note that we only accept fileids which are long enough rather than allow 35 * Note that we only accept fileids which are long enough rather than allow
@@ -215,9 +216,28 @@ xfs_fs_get_parent(
215 return d_obtain_alias(VFS_I(cip)); 216 return d_obtain_alias(VFS_I(cip));
216} 217}
217 218
219STATIC int
220xfs_fs_nfs_commit_metadata(
221 struct inode *inode)
222{
223 struct xfs_inode *ip = XFS_I(inode);
224 struct xfs_mount *mp = ip->i_mount;
225 int error = 0;
226
227 xfs_ilock(ip, XFS_ILOCK_SHARED);
228 if (xfs_ipincount(ip)) {
229 error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn,
230 XFS_LOG_SYNC, NULL);
231 }
232 xfs_iunlock(ip, XFS_ILOCK_SHARED);
233
234 return error;
235}
236
218const struct export_operations xfs_export_operations = { 237const struct export_operations xfs_export_operations = {
219 .encode_fh = xfs_fs_encode_fh, 238 .encode_fh = xfs_fs_encode_fh,
220 .fh_to_dentry = xfs_fs_fh_to_dentry, 239 .fh_to_dentry = xfs_fs_fh_to_dentry,
221 .fh_to_parent = xfs_fs_fh_to_parent, 240 .fh_to_parent = xfs_fs_fh_to_parent,
222 .get_parent = xfs_fs_get_parent, 241 .get_parent = xfs_fs_get_parent,
242 .commit_metadata = xfs_fs_nfs_commit_metadata,
223}; 243};
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index e4caeb28ce2e..42dd3bcfba6b 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -16,6 +16,7 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h"
19#include "xfs_bit.h" 20#include "xfs_bit.h"
20#include "xfs_log.h" 21#include "xfs_log.h"
21#include "xfs_inum.h" 22#include "xfs_inum.h"
@@ -34,52 +35,279 @@
34#include "xfs_dir2_sf.h" 35#include "xfs_dir2_sf.h"
35#include "xfs_dinode.h" 36#include "xfs_dinode.h"
36#include "xfs_inode.h" 37#include "xfs_inode.h"
38#include "xfs_inode_item.h"
39#include "xfs_bmap.h"
37#include "xfs_error.h" 40#include "xfs_error.h"
38#include "xfs_rw.h" 41#include "xfs_rw.h"
39#include "xfs_vnodeops.h" 42#include "xfs_vnodeops.h"
40#include "xfs_da_btree.h" 43#include "xfs_da_btree.h"
41#include "xfs_ioctl.h" 44#include "xfs_ioctl.h"
45#include "xfs_trace.h"
42 46
43#include <linux/dcache.h> 47#include <linux/dcache.h>
44 48
45static const struct vm_operations_struct xfs_file_vm_ops; 49static const struct vm_operations_struct xfs_file_vm_ops;
46 50
47STATIC ssize_t 51/*
48xfs_file_aio_read( 52 * xfs_iozero
49 struct kiocb *iocb, 53 *
50 const struct iovec *iov, 54 * xfs_iozero clears the specified range of buffer supplied,
51 unsigned long nr_segs, 55 * and marks all the affected blocks as valid and modified. If
52 loff_t pos) 56 * an affected block is not allocated, it will be allocated. If
57 * an affected block is not completely overwritten, and is not
58 * valid before the operation, it will be read from disk before
59 * being partially zeroed.
60 */
61STATIC int
62xfs_iozero(
63 struct xfs_inode *ip, /* inode */
64 loff_t pos, /* offset in file */
65 size_t count) /* size of data to zero */
53{ 66{
54 struct file *file = iocb->ki_filp; 67 struct page *page;
55 int ioflags = 0; 68 struct address_space *mapping;
69 int status;
56 70
57 BUG_ON(iocb->ki_pos != pos); 71 mapping = VFS_I(ip)->i_mapping;
58 if (unlikely(file->f_flags & O_DIRECT)) 72 do {
59 ioflags |= IO_ISDIRECT; 73 unsigned offset, bytes;
60 if (file->f_mode & FMODE_NOCMTIME) 74 void *fsdata;
61 ioflags |= IO_INVIS; 75
62 return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov, 76 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
63 nr_segs, &iocb->ki_pos, ioflags); 77 bytes = PAGE_CACHE_SIZE - offset;
78 if (bytes > count)
79 bytes = count;
80
81 status = pagecache_write_begin(NULL, mapping, pos, bytes,
82 AOP_FLAG_UNINTERRUPTIBLE,
83 &page, &fsdata);
84 if (status)
85 break;
86
87 zero_user(page, offset, bytes);
88
89 status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
90 page, fsdata);
91 WARN_ON(status <= 0); /* can't return less than zero! */
92 pos += bytes;
93 count -= bytes;
94 status = 0;
95 } while (count);
96
97 return (-status);
98}
99
100STATIC int
101xfs_file_fsync(
102 struct file *file,
103 struct dentry *dentry,
104 int datasync)
105{
106 struct xfs_inode *ip = XFS_I(dentry->d_inode);
107 struct xfs_trans *tp;
108 int error = 0;
109 int log_flushed = 0;
110
111 xfs_itrace_entry(ip);
112
113 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
114 return -XFS_ERROR(EIO);
115
116 xfs_iflags_clear(ip, XFS_ITRUNCATED);
117
118 /*
119 * We always need to make sure that the required inode state is safe on
120 * disk. The inode might be clean but we still might need to force the
121 * log because of committed transactions that haven't hit the disk yet.
122 * Likewise, there could be unflushed non-transactional changes to the
123 * inode core that have to go to disk and this requires us to issue
124 * a synchronous transaction to capture these changes correctly.
125 *
126 * This code relies on the assumption that if the i_update_core field
127 * of the inode is clear and the inode is unpinned then it is clean
128 * and no action is required.
129 */
130 xfs_ilock(ip, XFS_ILOCK_SHARED);
131
132 /*
133 * First check if the VFS inode is marked dirty. All the dirtying
134 * of non-transactional updates now goes through mark_inode_dirty*,
135 * which allows us to distinguish between pure timestamp updates
136 * and i_size updates which need to be caught for fdatasync.
137 * After that also check for the dirty state in the XFS inode, which
138 * might get cleared when the inode gets written out via the AIL
139 * or xfs_iflush_cluster.
140 */
141 if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) ||
142 ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
143 ip->i_update_core) {
144 /*
145 * Kick off a transaction to log the inode core to get the
146 * updates. The sync transaction will also force the log.
147 */
148 xfs_iunlock(ip, XFS_ILOCK_SHARED);
149 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
150 error = xfs_trans_reserve(tp, 0,
151 XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
152 if (error) {
153 xfs_trans_cancel(tp, 0);
154 return -error;
155 }
156 xfs_ilock(ip, XFS_ILOCK_EXCL);
157
158 /*
159 * Note - it's possible that we might have pushed ourselves out
160 * of the way during trans_reserve which would flush the inode.
161 * But there's no guarantee that the inode buffer has actually
162 * gone out yet (it's delwri). Plus the buffer could be pinned
163 * anyway if it's part of an inode in another recent
164 * transaction. So we play it safe and fire off the
165 * transaction anyway.
166 */
167 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
168 xfs_trans_ihold(tp, ip);
169 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
170 xfs_trans_set_sync(tp);
171 error = _xfs_trans_commit(tp, 0, &log_flushed);
172
173 xfs_iunlock(ip, XFS_ILOCK_EXCL);
174 } else {
175 /*
176 * Timestamps/size haven't changed since last inode flush or
177 * inode transaction commit. That means either nothing got
178 * written or a transaction committed which caught the updates.
179 * If the latter happened and the transaction hasn't hit the
180 * disk yet, the inode will still be pinned. If it is,
181 * force the log.
182 */
183 if (xfs_ipincount(ip)) {
184 error = _xfs_log_force_lsn(ip->i_mount,
185 ip->i_itemp->ili_last_lsn,
186 XFS_LOG_SYNC, &log_flushed);
187 }
188 xfs_iunlock(ip, XFS_ILOCK_SHARED);
189 }
190
191 if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
192 /*
193 * If the log write didn't issue an ordered tag we need
194 * to flush the disk cache for the data device now.
195 */
196 if (!log_flushed)
197 xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
198
199 /*
200 * If this inode is on the RT dev we need to flush that
201 * cache as well.
202 */
203 if (XFS_IS_REALTIME_INODE(ip))
204 xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
205 }
206
207 return -error;
64} 208}
65 209
66STATIC ssize_t 210STATIC ssize_t
67xfs_file_aio_write( 211xfs_file_aio_read(
68 struct kiocb *iocb, 212 struct kiocb *iocb,
69 const struct iovec *iov, 213 const struct iovec *iovp,
70 unsigned long nr_segs, 214 unsigned long nr_segs,
71 loff_t pos) 215 loff_t pos)
72{ 216{
73 struct file *file = iocb->ki_filp; 217 struct file *file = iocb->ki_filp;
218 struct inode *inode = file->f_mapping->host;
219 struct xfs_inode *ip = XFS_I(inode);
220 struct xfs_mount *mp = ip->i_mount;
221 size_t size = 0;
222 ssize_t ret = 0;
74 int ioflags = 0; 223 int ioflags = 0;
224 xfs_fsize_t n;
225 unsigned long seg;
226
227 XFS_STATS_INC(xs_read_calls);
75 228
76 BUG_ON(iocb->ki_pos != pos); 229 BUG_ON(iocb->ki_pos != pos);
230
77 if (unlikely(file->f_flags & O_DIRECT)) 231 if (unlikely(file->f_flags & O_DIRECT))
78 ioflags |= IO_ISDIRECT; 232 ioflags |= IO_ISDIRECT;
79 if (file->f_mode & FMODE_NOCMTIME) 233 if (file->f_mode & FMODE_NOCMTIME)
80 ioflags |= IO_INVIS; 234 ioflags |= IO_INVIS;
81 return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs, 235
82 &iocb->ki_pos, ioflags); 236 /* START copy & waste from filemap.c */
237 for (seg = 0; seg < nr_segs; seg++) {
238 const struct iovec *iv = &iovp[seg];
239
240 /*
241 * If any segment has a negative length, or the cumulative
242 * length ever wraps negative then return -EINVAL.
243 */
244 size += iv->iov_len;
245 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
246 return XFS_ERROR(-EINVAL);
247 }
248 /* END copy & waste from filemap.c */
249
250 if (unlikely(ioflags & IO_ISDIRECT)) {
251 xfs_buftarg_t *target =
252 XFS_IS_REALTIME_INODE(ip) ?
253 mp->m_rtdev_targp : mp->m_ddev_targp;
254 if ((iocb->ki_pos & target->bt_smask) ||
255 (size & target->bt_smask)) {
256 if (iocb->ki_pos == ip->i_size)
257 return 0;
258 return -XFS_ERROR(EINVAL);
259 }
260 }
261
262 n = XFS_MAXIOFFSET(mp) - iocb->ki_pos;
263 if (n <= 0 || size == 0)
264 return 0;
265
266 if (n < size)
267 size = n;
268
269 if (XFS_FORCED_SHUTDOWN(mp))
270 return -EIO;
271
272 if (unlikely(ioflags & IO_ISDIRECT))
273 mutex_lock(&inode->i_mutex);
274 xfs_ilock(ip, XFS_IOLOCK_SHARED);
275
276 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
277 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
278 int iolock = XFS_IOLOCK_SHARED;
279
280 ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, iocb->ki_pos, size,
281 dmflags, &iolock);
282 if (ret) {
283 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
284 if (unlikely(ioflags & IO_ISDIRECT))
285 mutex_unlock(&inode->i_mutex);
286 return ret;
287 }
288 }
289
290 if (unlikely(ioflags & IO_ISDIRECT)) {
291 if (inode->i_mapping->nrpages) {
292 ret = -xfs_flushinval_pages(ip,
293 (iocb->ki_pos & PAGE_CACHE_MASK),
294 -1, FI_REMAPF_LOCKED);
295 }
296 mutex_unlock(&inode->i_mutex);
297 if (ret) {
298 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
299 return ret;
300 }
301 }
302
303 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
304
305 ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
306 if (ret > 0)
307 XFS_STATS_ADD(xs_read_bytes, ret);
308
309 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
310 return ret;
83} 311}
84 312
85STATIC ssize_t 313STATIC ssize_t
@@ -87,16 +315,44 @@ xfs_file_splice_read(
87 struct file *infilp, 315 struct file *infilp,
88 loff_t *ppos, 316 loff_t *ppos,
89 struct pipe_inode_info *pipe, 317 struct pipe_inode_info *pipe,
90 size_t len, 318 size_t count,
91 unsigned int flags) 319 unsigned int flags)
92{ 320{
321 struct xfs_inode *ip = XFS_I(infilp->f_mapping->host);
322 struct xfs_mount *mp = ip->i_mount;
93 int ioflags = 0; 323 int ioflags = 0;
324 ssize_t ret;
325
326 XFS_STATS_INC(xs_read_calls);
94 327
95 if (infilp->f_mode & FMODE_NOCMTIME) 328 if (infilp->f_mode & FMODE_NOCMTIME)
96 ioflags |= IO_INVIS; 329 ioflags |= IO_INVIS;
97 330
98 return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode), 331 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
99 infilp, ppos, pipe, len, flags, ioflags); 332 return -EIO;
333
334 xfs_ilock(ip, XFS_IOLOCK_SHARED);
335
336 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
337 int iolock = XFS_IOLOCK_SHARED;
338 int error;
339
340 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
341 FILP_DELAY_FLAG(infilp), &iolock);
342 if (error) {
343 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
344 return -error;
345 }
346 }
347
348 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
349
350 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
351 if (ret > 0)
352 XFS_STATS_ADD(xs_read_bytes, ret);
353
354 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
355 return ret;
100} 356}
101 357
102STATIC ssize_t 358STATIC ssize_t
@@ -104,16 +360,538 @@ xfs_file_splice_write(
104 struct pipe_inode_info *pipe, 360 struct pipe_inode_info *pipe,
105 struct file *outfilp, 361 struct file *outfilp,
106 loff_t *ppos, 362 loff_t *ppos,
107 size_t len, 363 size_t count,
108 unsigned int flags) 364 unsigned int flags)
109{ 365{
366 struct inode *inode = outfilp->f_mapping->host;
367 struct xfs_inode *ip = XFS_I(inode);
368 struct xfs_mount *mp = ip->i_mount;
369 xfs_fsize_t isize, new_size;
110 int ioflags = 0; 370 int ioflags = 0;
371 ssize_t ret;
372
373 XFS_STATS_INC(xs_write_calls);
111 374
112 if (outfilp->f_mode & FMODE_NOCMTIME) 375 if (outfilp->f_mode & FMODE_NOCMTIME)
113 ioflags |= IO_INVIS; 376 ioflags |= IO_INVIS;
114 377
115 return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode), 378 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
116 pipe, outfilp, ppos, len, flags, ioflags); 379 return -EIO;
380
381 xfs_ilock(ip, XFS_IOLOCK_EXCL);
382
383 if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
384 int iolock = XFS_IOLOCK_EXCL;
385 int error;
386
387 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
388 FILP_DELAY_FLAG(outfilp), &iolock);
389 if (error) {
390 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
391 return -error;
392 }
393 }
394
395 new_size = *ppos + count;
396
397 xfs_ilock(ip, XFS_ILOCK_EXCL);
398 if (new_size > ip->i_size)
399 ip->i_new_size = new_size;
400 xfs_iunlock(ip, XFS_ILOCK_EXCL);
401
402 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
403
404 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
405 if (ret > 0)
406 XFS_STATS_ADD(xs_write_bytes, ret);
407
408 isize = i_size_read(inode);
409 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
410 *ppos = isize;
411
412 if (*ppos > ip->i_size) {
413 xfs_ilock(ip, XFS_ILOCK_EXCL);
414 if (*ppos > ip->i_size)
415 ip->i_size = *ppos;
416 xfs_iunlock(ip, XFS_ILOCK_EXCL);
417 }
418
419 if (ip->i_new_size) {
420 xfs_ilock(ip, XFS_ILOCK_EXCL);
421 ip->i_new_size = 0;
422 if (ip->i_d.di_size > ip->i_size)
423 ip->i_d.di_size = ip->i_size;
424 xfs_iunlock(ip, XFS_ILOCK_EXCL);
425 }
426 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
427 return ret;
428}
429
430/*
431 * This routine is called to handle zeroing any space in the last
432 * block of the file that is beyond the EOF. We do this since the
433 * size is being increased without writing anything to that block
434 * and we don't want anyone to read the garbage on the disk.
435 */
436STATIC int /* error (positive) */
437xfs_zero_last_block(
438 xfs_inode_t *ip,
439 xfs_fsize_t offset,
440 xfs_fsize_t isize)
441{
442 xfs_fileoff_t last_fsb;
443 xfs_mount_t *mp = ip->i_mount;
444 int nimaps;
445 int zero_offset;
446 int zero_len;
447 int error = 0;
448 xfs_bmbt_irec_t imap;
449
450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
451
452 zero_offset = XFS_B_FSB_OFFSET(mp, isize);
453 if (zero_offset == 0) {
454 /*
455 * There are no extra bytes in the last block on disk to
456 * zero, so return.
457 */
458 return 0;
459 }
460
461 last_fsb = XFS_B_TO_FSBT(mp, isize);
462 nimaps = 1;
463 error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
464 &nimaps, NULL, NULL);
465 if (error) {
466 return error;
467 }
468 ASSERT(nimaps > 0);
469 /*
470 * If the block underlying isize is just a hole, then there
471 * is nothing to zero.
472 */
473 if (imap.br_startblock == HOLESTARTBLOCK) {
474 return 0;
475 }
476 /*
477 * Zero the part of the last block beyond the EOF, and write it
478 * out sync. We need to drop the ilock while we do this so we
479 * don't deadlock when the buffer cache calls back to us.
480 */
481 xfs_iunlock(ip, XFS_ILOCK_EXCL);
482
483 zero_len = mp->m_sb.sb_blocksize - zero_offset;
484 if (isize + zero_len > offset)
485 zero_len = offset - isize;
486 error = xfs_iozero(ip, isize, zero_len);
487
488 xfs_ilock(ip, XFS_ILOCK_EXCL);
489 ASSERT(error >= 0);
490 return error;
491}
492
493/*
494 * Zero any on disk space between the current EOF and the new,
495 * larger EOF. This handles the normal case of zeroing the remainder
496 * of the last block in the file and the unusual case of zeroing blocks
497 * out beyond the size of the file. This second case only happens
498 * with fixed size extents and when the system crashes before the inode
499 * size was updated but after blocks were allocated. Holes and
500 * unwritten extents in the range are skipped rather than zeroed; only
501 * blocks that are already allocated and written get zeroed.
502 */
503
504int /* error (positive) */
505xfs_zero_eof(
506 xfs_inode_t *ip,
507 xfs_off_t offset, /* starting I/O offset */
508 xfs_fsize_t isize) /* current inode size */
509{
510 xfs_mount_t *mp = ip->i_mount;
511 xfs_fileoff_t start_zero_fsb;
512 xfs_fileoff_t end_zero_fsb;
513 xfs_fileoff_t zero_count_fsb;
514 xfs_fileoff_t last_fsb;
515 xfs_fileoff_t zero_off;
516 xfs_fsize_t zero_len;
517 int nimaps;
518 int error = 0;
519 xfs_bmbt_irec_t imap;
520
521 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
522 ASSERT(offset > isize);
523
524 /*
525 * First handle zeroing the block on which isize resides.
526 * We only zero a part of that block so it is handled specially.
527 */
528 error = xfs_zero_last_block(ip, offset, isize);
529 if (error) {
530 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
531 return error;
532 }
533
534 /*
535 * Calculate the range between the new size and the old
536 * where blocks needing to be zeroed may exist. To get the
537 * block where the last byte in the file currently resides,
538 * we need to subtract one from the size and truncate back
539 * to a block boundary. We subtract 1 in case the size is
540 * exactly on a block boundary.
541 */
542 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
543 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
544 end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
545 ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
546 if (last_fsb == end_zero_fsb) {
547 /*
548 * The size was only incremented on its last block.
549 * We took care of that above, so just return.
550 */
551 return 0;
552 }
553
554 ASSERT(start_zero_fsb <= end_zero_fsb);
555 while (start_zero_fsb <= end_zero_fsb) {
556 nimaps = 1;
557 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
558 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
559 0, NULL, 0, &imap, &nimaps, NULL, NULL);
560 if (error) {
561 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
562 return error;
563 }
564 ASSERT(nimaps > 0);
565
566 if (imap.br_state == XFS_EXT_UNWRITTEN ||
567 imap.br_startblock == HOLESTARTBLOCK) {
568 /*
569 * This loop handles initializing pages that were
570 * partially initialized by the code below this
571 * loop. It basically zeroes the part of the page
572 * that sits on a hole and sets the page as P_HOLE
573 * and calls remapf if it is a mapped file.
574 */
575 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
576 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
577 continue;
578 }
579
580 /*
581 * There are blocks we need to zero.
582 * Drop the inode lock while we're doing the I/O.
583 * We'll still have the iolock to protect us.
584 */
585 xfs_iunlock(ip, XFS_ILOCK_EXCL);
586
587 zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
588 zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
589
590 if ((zero_off + zero_len) > offset)
591 zero_len = offset - zero_off;
592
593 error = xfs_iozero(ip, zero_off, zero_len);
594 if (error) {
595 goto out_lock;
596 }
597
598 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
599 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
600
601 xfs_ilock(ip, XFS_ILOCK_EXCL);
602 }
603
604 return 0;
605
606out_lock:
607 xfs_ilock(ip, XFS_ILOCK_EXCL);
608 ASSERT(error >= 0);
609 return error;
610}
611
612STATIC ssize_t
613xfs_file_aio_write(
614 struct kiocb *iocb,
615 const struct iovec *iovp,
616 unsigned long nr_segs,
617 loff_t pos)
618{
619 struct file *file = iocb->ki_filp;
620 struct address_space *mapping = file->f_mapping;
621 struct inode *inode = mapping->host;
622 struct xfs_inode *ip = XFS_I(inode);
623 struct xfs_mount *mp = ip->i_mount;
624 ssize_t ret = 0, error = 0;
625 int ioflags = 0;
626 xfs_fsize_t isize, new_size;
627 int iolock;
628 int eventsent = 0;
629 size_t ocount = 0, count;
630 int need_i_mutex;
631
632 XFS_STATS_INC(xs_write_calls);
633
634 BUG_ON(iocb->ki_pos != pos);
635
636 if (unlikely(file->f_flags & O_DIRECT))
637 ioflags |= IO_ISDIRECT;
638 if (file->f_mode & FMODE_NOCMTIME)
639 ioflags |= IO_INVIS;
640
641 error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
642 if (error)
643 return error;
644
645 count = ocount;
646 if (count == 0)
647 return 0;
648
649 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
650
651 if (XFS_FORCED_SHUTDOWN(mp))
652 return -EIO;
653
654relock:
655 if (ioflags & IO_ISDIRECT) {
656 iolock = XFS_IOLOCK_SHARED;
657 need_i_mutex = 0;
658 } else {
659 iolock = XFS_IOLOCK_EXCL;
660 need_i_mutex = 1;
661 mutex_lock(&inode->i_mutex);
662 }
663
664 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
665
666start:
667 error = -generic_write_checks(file, &pos, &count,
668 S_ISBLK(inode->i_mode));
669 if (error) {
670 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
671 goto out_unlock_mutex;
672 }
673
674 if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) &&
675 !(ioflags & IO_INVIS) && !eventsent)) {
676 int dmflags = FILP_DELAY_FLAG(file);
677
678 if (need_i_mutex)
679 dmflags |= DM_FLAGS_IMUX;
680
681 xfs_iunlock(ip, XFS_ILOCK_EXCL);
682 error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip,
683 pos, count, dmflags, &iolock);
684 if (error) {
685 goto out_unlock_internal;
686 }
687 xfs_ilock(ip, XFS_ILOCK_EXCL);
688 eventsent = 1;
689
690 /*
691 * The iolock was dropped and reacquired in XFS_SEND_DATA
692 * so we have to recheck the size when appending.
693 * We will only "goto start;" once, since having sent the
694 * event prevents another call to XFS_SEND_DATA, which is
695 * what allows the size to change in the first place.
696 */
697 if ((file->f_flags & O_APPEND) && pos != ip->i_size)
698 goto start;
699 }
700
701 if (ioflags & IO_ISDIRECT) {
702 xfs_buftarg_t *target =
703 XFS_IS_REALTIME_INODE(ip) ?
704 mp->m_rtdev_targp : mp->m_ddev_targp;
705
706 if ((pos & target->bt_smask) || (count & target->bt_smask)) {
707 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
708 return XFS_ERROR(-EINVAL);
709 }
710
711 if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) {
712 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
713 iolock = XFS_IOLOCK_EXCL;
714 need_i_mutex = 1;
715 mutex_lock(&inode->i_mutex);
716 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
717 goto start;
718 }
719 }
720
721 new_size = pos + count;
722 if (new_size > ip->i_size)
723 ip->i_new_size = new_size;
724
725 if (likely(!(ioflags & IO_INVIS)))
726 file_update_time(file);
727
728 /*
729 * If the offset is beyond the size of the file, we have a couple
730 * of things to do. First, if there is already space allocated
731 * we need to either create holes or zero the disk or ...
732 *
733 * If there is a page where the previous size lands, we need
734 * to zero it out up to the new size.
735 */
736
737 if (pos > ip->i_size) {
738 error = xfs_zero_eof(ip, pos, ip->i_size);
739 if (error) {
740 xfs_iunlock(ip, XFS_ILOCK_EXCL);
741 goto out_unlock_internal;
742 }
743 }
744 xfs_iunlock(ip, XFS_ILOCK_EXCL);
745
746 /*
747 * If we're writing the file then make sure to clear the
748 * setuid and setgid bits if the process is not being run
749 * by root. This keeps people from modifying setuid and
750 * setgid binaries.
751 */
752 error = -file_remove_suid(file);
753 if (unlikely(error))
754 goto out_unlock_internal;
755
756 /* We can write back this queue in page reclaim */
757 current->backing_dev_info = mapping->backing_dev_info;
758
759 if ((ioflags & IO_ISDIRECT)) {
760 if (mapping->nrpages) {
761 WARN_ON(need_i_mutex == 0);
762 error = xfs_flushinval_pages(ip,
763 (pos & PAGE_CACHE_MASK),
764 -1, FI_REMAPF_LOCKED);
765 if (error)
766 goto out_unlock_internal;
767 }
768
769 if (need_i_mutex) {
770 /* demote the lock now the cached pages are gone */
771 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
772 mutex_unlock(&inode->i_mutex);
773
774 iolock = XFS_IOLOCK_SHARED;
775 need_i_mutex = 0;
776 }
777
778 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags);
779 ret = generic_file_direct_write(iocb, iovp,
780 &nr_segs, pos, &iocb->ki_pos, count, ocount);
781
782 /*
783 * direct-io write to a hole: fall through to buffered I/O
784 * for completing the rest of the request.
785 */
786 if (ret >= 0 && ret != count) {
787 XFS_STATS_ADD(xs_write_bytes, ret);
788
789 pos += ret;
790 count -= ret;
791
792 ioflags &= ~IO_ISDIRECT;
793 xfs_iunlock(ip, iolock);
794 goto relock;
795 }
796 } else {
797 int enospc = 0;
798 ssize_t ret2 = 0;
799
800write_retry:
801 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags);
802 ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
803 pos, &iocb->ki_pos, count, ret);
804 /*
805 * if we just got an ENOSPC, flush the inode now we
806 * aren't holding any page locks and retry *once*
807 */
808 if (ret2 == -ENOSPC && !enospc) {
809 error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
810 if (error)
811 goto out_unlock_internal;
812 enospc = 1;
813 goto write_retry;
814 }
815 ret = ret2;
816 }
817
818 current->backing_dev_info = NULL;
819
820 isize = i_size_read(inode);
821 if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize))
822 iocb->ki_pos = isize;
823
824 if (iocb->ki_pos > ip->i_size) {
825 xfs_ilock(ip, XFS_ILOCK_EXCL);
826 if (iocb->ki_pos > ip->i_size)
827 ip->i_size = iocb->ki_pos;
828 xfs_iunlock(ip, XFS_ILOCK_EXCL);
829 }
830
831 if (ret == -ENOSPC &&
832 DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
833 xfs_iunlock(ip, iolock);
834 if (need_i_mutex)
835 mutex_unlock(&inode->i_mutex);
836 error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip,
837 DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL,
838 0, 0, 0); /* Delay flag intentionally unused */
839 if (need_i_mutex)
840 mutex_lock(&inode->i_mutex);
841 xfs_ilock(ip, iolock);
842 if (error)
843 goto out_unlock_internal;
844 goto start;
845 }
846
847 error = -ret;
848 if (ret <= 0)
849 goto out_unlock_internal;
850
851 XFS_STATS_ADD(xs_write_bytes, ret);
852
853 /* Handle various SYNC-type writes */
854 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
855 loff_t end = pos + ret - 1;
856 int error2;
857
858 xfs_iunlock(ip, iolock);
859 if (need_i_mutex)
860 mutex_unlock(&inode->i_mutex);
861
862 error2 = filemap_write_and_wait_range(mapping, pos, end);
863 if (!error)
864 error = error2;
865 if (need_i_mutex)
866 mutex_lock(&inode->i_mutex);
867 xfs_ilock(ip, iolock);
868
869 error2 = -xfs_file_fsync(file, file->f_path.dentry,
870 (file->f_flags & __O_SYNC) ? 0 : 1);
871 if (!error)
872 error = error2;
873 }
874
875 out_unlock_internal:
876 if (ip->i_new_size) {
877 xfs_ilock(ip, XFS_ILOCK_EXCL);
878 ip->i_new_size = 0;
879 /*
880 * If this was a direct or synchronous I/O that failed (such
881 * as ENOSPC) then part of the I/O may have been written to
882 * disk before the error occurred. In this case the on-disk
883 * file size may have been adjusted beyond the in-memory file
884 * size and now needs to be truncated back.
885 */
886 if (ip->i_d.di_size > ip->i_size)
887 ip->i_d.di_size = ip->i_size;
888 xfs_iunlock(ip, XFS_ILOCK_EXCL);
889 }
890 xfs_iunlock(ip, iolock);
891 out_unlock_mutex:
892 if (need_i_mutex)
893 mutex_unlock(&inode->i_mutex);
894 return -error;
117} 895}
118 896
119STATIC int 897STATIC int
@@ -160,28 +938,6 @@ xfs_file_release(
160 return -xfs_release(XFS_I(inode)); 938 return -xfs_release(XFS_I(inode));
161} 939}
162 940
163/*
164 * We ignore the datasync flag here because a datasync is effectively
165 * identical to an fsync. That is, datasync implies that we need to write
166 * only the metadata needed to be able to access the data that is written
167 * if we crash after the call completes. Hence if we are writing beyond
168 * EOF we have to log the inode size change as well, which makes it a
169 * full fsync. If we don't write beyond EOF, the inode core will be
170 * clean in memory and so we don't need to log the inode, just like
171 * fsync.
172 */
173STATIC int
174xfs_file_fsync(
175 struct file *file,
176 struct dentry *dentry,
177 int datasync)
178{
179 struct xfs_inode *ip = XFS_I(dentry->d_inode);
180
181 xfs_iflags_clear(ip, XFS_ITRUNCATED);
182 return -xfs_fsync(ip);
183}
184
185STATIC int 941STATIC int
186xfs_file_readdir( 942xfs_file_readdir(
187 struct file *filp, 943 struct file *filp,
@@ -203,9 +959,9 @@ xfs_file_readdir(
203 * 959 *
204 * Try to give it an estimate that's good enough, maybe at some 960 * Try to give it an estimate that's good enough, maybe at some
205 * point we can change the ->readdir prototype to include the 961 * point we can change the ->readdir prototype to include the
206 * buffer size. 962 * buffer size. For now we use the current glibc buffer size.
207 */ 963 */
208 bufsize = (size_t)min_t(loff_t, PAGE_SIZE, ip->i_d.di_size); 964 bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
209 965
210 error = xfs_readdir(ip, dirent, bufsize, 966 error = xfs_readdir(ip, dirent, bufsize,
211 (xfs_off_t *)&filp->f_pos, filldir); 967 (xfs_off_t *)&filp->f_pos, filldir);
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 7501b85fd860..b6918d76bc7b 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -79,7 +79,7 @@ xfs_flush_pages(
79 xfs_iflags_clear(ip, XFS_ITRUNCATED); 79 xfs_iflags_clear(ip, XFS_ITRUNCATED);
80 ret = -filemap_fdatawrite(mapping); 80 ret = -filemap_fdatawrite(mapping);
81 } 81 }
82 if (flags & XFS_B_ASYNC) 82 if (flags & XBF_ASYNC)
83 return ret; 83 return ret;
84 ret2 = xfs_wait_on_pages(ip, first, last); 84 ret2 = xfs_wait_on_pages(ip, first, last);
85 if (!ret) 85 if (!ret)
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index a034cf624437..7b26cc2fd284 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -58,6 +58,7 @@
58#include <linux/mount.h> 58#include <linux/mount.h>
59#include <linux/namei.h> 59#include <linux/namei.h>
60#include <linux/pagemap.h> 60#include <linux/pagemap.h>
61#include <linux/slab.h>
61#include <linux/exportfs.h> 62#include <linux/exportfs.h>
62 63
63/* 64/*
@@ -447,12 +448,12 @@ xfs_attrlist_by_handle(
447int 448int
448xfs_attrmulti_attr_get( 449xfs_attrmulti_attr_get(
449 struct inode *inode, 450 struct inode *inode,
450 char *name, 451 unsigned char *name,
451 char __user *ubuf, 452 unsigned char __user *ubuf,
452 __uint32_t *len, 453 __uint32_t *len,
453 __uint32_t flags) 454 __uint32_t flags)
454{ 455{
455 char *kbuf; 456 unsigned char *kbuf;
456 int error = EFAULT; 457 int error = EFAULT;
457 458
458 if (*len > XATTR_SIZE_MAX) 459 if (*len > XATTR_SIZE_MAX)
@@ -476,12 +477,12 @@ xfs_attrmulti_attr_get(
476int 477int
477xfs_attrmulti_attr_set( 478xfs_attrmulti_attr_set(
478 struct inode *inode, 479 struct inode *inode,
479 char *name, 480 unsigned char *name,
480 const char __user *ubuf, 481 const unsigned char __user *ubuf,
481 __uint32_t len, 482 __uint32_t len,
482 __uint32_t flags) 483 __uint32_t flags)
483{ 484{
484 char *kbuf; 485 unsigned char *kbuf;
485 int error = EFAULT; 486 int error = EFAULT;
486 487
487 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 488 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -501,7 +502,7 @@ xfs_attrmulti_attr_set(
501int 502int
502xfs_attrmulti_attr_remove( 503xfs_attrmulti_attr_remove(
503 struct inode *inode, 504 struct inode *inode,
504 char *name, 505 unsigned char *name,
505 __uint32_t flags) 506 __uint32_t flags)
506{ 507{
507 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 508 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -519,7 +520,7 @@ xfs_attrmulti_by_handle(
519 xfs_fsop_attrmulti_handlereq_t am_hreq; 520 xfs_fsop_attrmulti_handlereq_t am_hreq;
520 struct dentry *dentry; 521 struct dentry *dentry;
521 unsigned int i, size; 522 unsigned int i, size;
522 char *attr_name; 523 unsigned char *attr_name;
523 524
524 if (!capable(CAP_SYS_ADMIN)) 525 if (!capable(CAP_SYS_ADMIN))
525 return -XFS_ERROR(EPERM); 526 return -XFS_ERROR(EPERM);
@@ -547,7 +548,7 @@ xfs_attrmulti_by_handle(
547 548
548 error = 0; 549 error = 0;
549 for (i = 0; i < am_hreq.opcount; i++) { 550 for (i = 0; i < am_hreq.opcount; i++) {
550 ops[i].am_error = strncpy_from_user(attr_name, 551 ops[i].am_error = strncpy_from_user((char *)attr_name,
551 ops[i].am_attrname, MAXNAMELEN); 552 ops[i].am_attrname, MAXNAMELEN);
552 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) 553 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
553 error = -ERANGE; 554 error = -ERANGE;
@@ -1431,6 +1432,9 @@ xfs_file_ioctl(
1431 if (!capable(CAP_SYS_ADMIN)) 1432 if (!capable(CAP_SYS_ADMIN))
1432 return -EPERM; 1433 return -EPERM;
1433 1434
1435 if (mp->m_flags & XFS_MOUNT_RDONLY)
1436 return -XFS_ERROR(EROFS);
1437
1434 if (copy_from_user(&inout, arg, sizeof(inout))) 1438 if (copy_from_user(&inout, arg, sizeof(inout)))
1435 return -XFS_ERROR(EFAULT); 1439 return -XFS_ERROR(EFAULT);
1436 1440
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h
index 7bd7c6afc1eb..d56173b34a2a 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl.h
@@ -45,23 +45,23 @@ xfs_readlink_by_handle(
45extern int 45extern int
46xfs_attrmulti_attr_get( 46xfs_attrmulti_attr_get(
47 struct inode *inode, 47 struct inode *inode,
48 char *name, 48 unsigned char *name,
49 char __user *ubuf, 49 unsigned char __user *ubuf,
50 __uint32_t *len, 50 __uint32_t *len,
51 __uint32_t flags); 51 __uint32_t flags);
52 52
53extern int 53extern int
54 xfs_attrmulti_attr_set( 54xfs_attrmulti_attr_set(
55 struct inode *inode, 55 struct inode *inode,
56 char *name, 56 unsigned char *name,
57 const char __user *ubuf, 57 const unsigned char __user *ubuf,
58 __uint32_t len, 58 __uint32_t len,
59 __uint32_t flags); 59 __uint32_t flags);
60 60
61extern int 61extern int
62xfs_attrmulti_attr_remove( 62xfs_attrmulti_attr_remove(
63 struct inode *inode, 63 struct inode *inode,
64 char *name, 64 unsigned char *name,
65 __uint32_t flags); 65 __uint32_t flags);
66 66
67extern struct dentry * 67extern struct dentry *
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index be1527b1670c..593c05b4df8d 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -18,6 +18,7 @@
18#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/ioctl.h> 19#include <linux/ioctl.h>
20#include <linux/mount.h> 20#include <linux/mount.h>
21#include <linux/slab.h>
21#include <asm/uaccess.h> 22#include <asm/uaccess.h>
22#include "xfs.h" 23#include "xfs.h"
23#include "xfs_fs.h" 24#include "xfs_fs.h"
@@ -411,7 +412,7 @@ xfs_compat_attrmulti_by_handle(
411 compat_xfs_fsop_attrmulti_handlereq_t am_hreq; 412 compat_xfs_fsop_attrmulti_handlereq_t am_hreq;
412 struct dentry *dentry; 413 struct dentry *dentry;
413 unsigned int i, size; 414 unsigned int i, size;
414 char *attr_name; 415 unsigned char *attr_name;
415 416
416 if (!capable(CAP_SYS_ADMIN)) 417 if (!capable(CAP_SYS_ADMIN))
417 return -XFS_ERROR(EPERM); 418 return -XFS_ERROR(EPERM);
@@ -440,7 +441,7 @@ xfs_compat_attrmulti_by_handle(
440 441
441 error = 0; 442 error = 0;
442 for (i = 0; i < am_hreq.opcount; i++) { 443 for (i = 0; i < am_hreq.opcount; i++) {
443 ops[i].am_error = strncpy_from_user(attr_name, 444 ops[i].am_error = strncpy_from_user((char *)attr_name,
444 compat_ptr(ops[i].am_attrname), 445 compat_ptr(ops[i].am_attrname),
445 MAXNAMELEN); 446 MAXNAMELEN);
446 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) 447 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 225946012d0b..e65a7937f3a4 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -56,6 +56,7 @@
56#include <linux/security.h> 56#include <linux/security.h>
57#include <linux/falloc.h> 57#include <linux/falloc.h>
58#include <linux/fiemap.h> 58#include <linux/fiemap.h>
59#include <linux/slab.h>
59 60
60/* 61/*
61 * Bring the timestamps in the XFS inode uptodate. 62 * Bring the timestamps in the XFS inode uptodate.
@@ -91,6 +92,16 @@ xfs_mark_inode_dirty_sync(
91 mark_inode_dirty_sync(inode); 92 mark_inode_dirty_sync(inode);
92} 93}
93 94
95void
96xfs_mark_inode_dirty(
97 xfs_inode_t *ip)
98{
99 struct inode *inode = VFS_I(ip);
100
101 if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR)))
102 mark_inode_dirty(inode);
103}
104
94/* 105/*
95 * Change the requested timestamp in the given inode. 106 * Change the requested timestamp in the given inode.
96 * We don't lock across timestamp updates, and we don't log them but 107 * We don't lock across timestamp updates, and we don't log them but
@@ -140,10 +151,10 @@ xfs_init_security(
140 struct xfs_inode *ip = XFS_I(inode); 151 struct xfs_inode *ip = XFS_I(inode);
141 size_t length; 152 size_t length;
142 void *value; 153 void *value;
143 char *name; 154 unsigned char *name;
144 int error; 155 int error;
145 156
146 error = security_inode_init_security(inode, dir, &name, 157 error = security_inode_init_security(inode, dir, (char **)&name,
147 &value, &length); 158 &value, &length);
148 if (error) { 159 if (error) {
149 if (error == -EOPNOTSUPP) 160 if (error == -EOPNOTSUPP)
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 5af0c81ca1ae..facfb323a706 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -88,7 +88,6 @@
88#include <xfs_super.h> 88#include <xfs_super.h>
89#include <xfs_globals.h> 89#include <xfs_globals.h>
90#include <xfs_fs_subr.h> 90#include <xfs_fs_subr.h>
91#include <xfs_lrw.h>
92#include <xfs_buf.h> 91#include <xfs_buf.h>
93 92
94/* 93/*
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
deleted file mode 100644
index 0d32457abef1..000000000000
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ /dev/null
@@ -1,852 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_bit.h"
21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h"
30#include "xfs_mount.h"
31#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h"
38#include "xfs_bmap.h"
39#include "xfs_btree.h"
40#include "xfs_ialloc.h"
41#include "xfs_rtalloc.h"
42#include "xfs_error.h"
43#include "xfs_itable.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h"
46#include "xfs_inode_item.h"
47#include "xfs_buf_item.h"
48#include "xfs_utils.h"
49#include "xfs_iomap.h"
50#include "xfs_vnodeops.h"
51#include "xfs_trace.h"
52
53#include <linux/capability.h>
54#include <linux/writeback.h>
55
56
57/*
58 * xfs_iozero
59 *
60 * xfs_iozero clears the specified range of buffer supplied,
61 * and marks all the affected blocks as valid and modified. If
62 * an affected block is not allocated, it will be allocated. If
63 * an affected block is not completely overwritten, and is not
64 * valid before the operation, it will be read from disk before
65 * being partially zeroed.
66 */
67STATIC int
68xfs_iozero(
69 struct xfs_inode *ip, /* inode */
70 loff_t pos, /* offset in file */
71 size_t count) /* size of data to zero */
72{
73 struct page *page;
74 struct address_space *mapping;
75 int status;
76
77 mapping = VFS_I(ip)->i_mapping;
78 do {
79 unsigned offset, bytes;
80 void *fsdata;
81
82 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
83 bytes = PAGE_CACHE_SIZE - offset;
84 if (bytes > count)
85 bytes = count;
86
87 status = pagecache_write_begin(NULL, mapping, pos, bytes,
88 AOP_FLAG_UNINTERRUPTIBLE,
89 &page, &fsdata);
90 if (status)
91 break;
92
93 zero_user(page, offset, bytes);
94
95 status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
96 page, fsdata);
97 WARN_ON(status <= 0); /* can't return less than zero! */
98 pos += bytes;
99 count -= bytes;
100 status = 0;
101 } while (count);
102
103 return (-status);
104}
105
106ssize_t /* bytes read, or (-) error */
107xfs_read(
108 xfs_inode_t *ip,
109 struct kiocb *iocb,
110 const struct iovec *iovp,
111 unsigned int segs,
112 loff_t *offset,
113 int ioflags)
114{
115 struct file *file = iocb->ki_filp;
116 struct inode *inode = file->f_mapping->host;
117 xfs_mount_t *mp = ip->i_mount;
118 size_t size = 0;
119 ssize_t ret = 0;
120 xfs_fsize_t n;
121 unsigned long seg;
122
123
124 XFS_STATS_INC(xs_read_calls);
125
126 /* START copy & waste from filemap.c */
127 for (seg = 0; seg < segs; seg++) {
128 const struct iovec *iv = &iovp[seg];
129
130 /*
131 * If any segment has a negative length, or the cumulative
132 * length ever wraps negative then return -EINVAL.
133 */
134 size += iv->iov_len;
135 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
136 return XFS_ERROR(-EINVAL);
137 }
138 /* END copy & waste from filemap.c */
139
140 if (unlikely(ioflags & IO_ISDIRECT)) {
141 xfs_buftarg_t *target =
142 XFS_IS_REALTIME_INODE(ip) ?
143 mp->m_rtdev_targp : mp->m_ddev_targp;
144 if ((*offset & target->bt_smask) ||
145 (size & target->bt_smask)) {
146 if (*offset == ip->i_size) {
147 return (0);
148 }
149 return -XFS_ERROR(EINVAL);
150 }
151 }
152
153 n = XFS_MAXIOFFSET(mp) - *offset;
154 if ((n <= 0) || (size == 0))
155 return 0;
156
157 if (n < size)
158 size = n;
159
160 if (XFS_FORCED_SHUTDOWN(mp))
161 return -EIO;
162
163 if (unlikely(ioflags & IO_ISDIRECT))
164 mutex_lock(&inode->i_mutex);
165 xfs_ilock(ip, XFS_IOLOCK_SHARED);
166
167 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
168 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
169 int iolock = XFS_IOLOCK_SHARED;
170
171 ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size,
172 dmflags, &iolock);
173 if (ret) {
174 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
175 if (unlikely(ioflags & IO_ISDIRECT))
176 mutex_unlock(&inode->i_mutex);
177 return ret;
178 }
179 }
180
181 if (unlikely(ioflags & IO_ISDIRECT)) {
182 if (inode->i_mapping->nrpages)
183 ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
184 -1, FI_REMAPF_LOCKED);
185 mutex_unlock(&inode->i_mutex);
186 if (ret) {
187 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
188 return ret;
189 }
190 }
191
192 trace_xfs_file_read(ip, size, *offset, ioflags);
193
194 iocb->ki_pos = *offset;
195 ret = generic_file_aio_read(iocb, iovp, segs, *offset);
196 if (ret > 0)
197 XFS_STATS_ADD(xs_read_bytes, ret);
198
199 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
200 return ret;
201}
202
203ssize_t
204xfs_splice_read(
205 xfs_inode_t *ip,
206 struct file *infilp,
207 loff_t *ppos,
208 struct pipe_inode_info *pipe,
209 size_t count,
210 int flags,
211 int ioflags)
212{
213 xfs_mount_t *mp = ip->i_mount;
214 ssize_t ret;
215
216 XFS_STATS_INC(xs_read_calls);
217 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
218 return -EIO;
219
220 xfs_ilock(ip, XFS_IOLOCK_SHARED);
221
222 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
223 int iolock = XFS_IOLOCK_SHARED;
224 int error;
225
226 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
227 FILP_DELAY_FLAG(infilp), &iolock);
228 if (error) {
229 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
230 return -error;
231 }
232 }
233
234 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
235
236 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
237 if (ret > 0)
238 XFS_STATS_ADD(xs_read_bytes, ret);
239
240 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
241 return ret;
242}
243
244ssize_t
245xfs_splice_write(
246 xfs_inode_t *ip,
247 struct pipe_inode_info *pipe,
248 struct file *outfilp,
249 loff_t *ppos,
250 size_t count,
251 int flags,
252 int ioflags)
253{
254 xfs_mount_t *mp = ip->i_mount;
255 ssize_t ret;
256 struct inode *inode = outfilp->f_mapping->host;
257 xfs_fsize_t isize, new_size;
258
259 XFS_STATS_INC(xs_write_calls);
260 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
261 return -EIO;
262
263 xfs_ilock(ip, XFS_IOLOCK_EXCL);
264
265 if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
266 int iolock = XFS_IOLOCK_EXCL;
267 int error;
268
269 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
270 FILP_DELAY_FLAG(outfilp), &iolock);
271 if (error) {
272 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
273 return -error;
274 }
275 }
276
277 new_size = *ppos + count;
278
279 xfs_ilock(ip, XFS_ILOCK_EXCL);
280 if (new_size > ip->i_size)
281 ip->i_new_size = new_size;
282 xfs_iunlock(ip, XFS_ILOCK_EXCL);
283
284 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
285
286 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
287 if (ret > 0)
288 XFS_STATS_ADD(xs_write_bytes, ret);
289
290 isize = i_size_read(inode);
291 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
292 *ppos = isize;
293
294 if (*ppos > ip->i_size) {
295 xfs_ilock(ip, XFS_ILOCK_EXCL);
296 if (*ppos > ip->i_size)
297 ip->i_size = *ppos;
298 xfs_iunlock(ip, XFS_ILOCK_EXCL);
299 }
300
301 if (ip->i_new_size) {
302 xfs_ilock(ip, XFS_ILOCK_EXCL);
303 ip->i_new_size = 0;
304 if (ip->i_d.di_size > ip->i_size)
305 ip->i_d.di_size = ip->i_size;
306 xfs_iunlock(ip, XFS_ILOCK_EXCL);
307 }
308 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
309 return ret;
310}
311
312/*
313 * This routine is called to handle zeroing any space in the last
314 * block of the file that is beyond the EOF. We do this since the
315 * size is being increased without writing anything to that block
316 * and we don't want anyone to read the garbage on the disk.
317 */
318STATIC int /* error (positive) */
319xfs_zero_last_block(
320 xfs_inode_t *ip,
321 xfs_fsize_t offset,
322 xfs_fsize_t isize)
323{
324 xfs_fileoff_t last_fsb;
325 xfs_mount_t *mp = ip->i_mount;
326 int nimaps;
327 int zero_offset;
328 int zero_len;
329 int error = 0;
330 xfs_bmbt_irec_t imap;
331
332 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
333
334 zero_offset = XFS_B_FSB_OFFSET(mp, isize);
335 if (zero_offset == 0) {
336 /*
337 * There are no extra bytes in the last block on disk to
338 * zero, so return.
339 */
340 return 0;
341 }
342
343 last_fsb = XFS_B_TO_FSBT(mp, isize);
344 nimaps = 1;
345 error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
346 &nimaps, NULL, NULL);
347 if (error) {
348 return error;
349 }
350 ASSERT(nimaps > 0);
351 /*
352 * If the block underlying isize is just a hole, then there
353 * is nothing to zero.
354 */
355 if (imap.br_startblock == HOLESTARTBLOCK) {
356 return 0;
357 }
358 /*
359 * Zero the part of the last block beyond the EOF, and write it
360 * out sync. We need to drop the ilock while we do this so we
361 * don't deadlock when the buffer cache calls back to us.
362 */
363 xfs_iunlock(ip, XFS_ILOCK_EXCL);
364
365 zero_len = mp->m_sb.sb_blocksize - zero_offset;
366 if (isize + zero_len > offset)
367 zero_len = offset - isize;
368 error = xfs_iozero(ip, isize, zero_len);
369
370 xfs_ilock(ip, XFS_ILOCK_EXCL);
371 ASSERT(error >= 0);
372 return error;
373}
374
375/*
376 * Zero any on disk space between the current EOF and the new,
377 * larger EOF. This handles the normal case of zeroing the remainder
378 * of the last block in the file and the unusual case of zeroing blocks
379 * out beyond the size of the file. This second case only happens
380 * with fixed size extents and when the system crashes before the inode
381 * size was updated but after blocks were allocated. If fill is set,
382 * then any holes in the range are filled and zeroed. If not, the holes
383 * are left alone as holes.
384 */
385
386int /* error (positive) */
387xfs_zero_eof(
388 xfs_inode_t *ip,
389 xfs_off_t offset, /* starting I/O offset */
390 xfs_fsize_t isize) /* current inode size */
391{
392 xfs_mount_t *mp = ip->i_mount;
393 xfs_fileoff_t start_zero_fsb;
394 xfs_fileoff_t end_zero_fsb;
395 xfs_fileoff_t zero_count_fsb;
396 xfs_fileoff_t last_fsb;
397 xfs_fileoff_t zero_off;
398 xfs_fsize_t zero_len;
399 int nimaps;
400 int error = 0;
401 xfs_bmbt_irec_t imap;
402
403 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
404 ASSERT(offset > isize);
405
406 /*
407 * First handle zeroing the block on which isize resides.
408 * We only zero a part of that block so it is handled specially.
409 */
410 error = xfs_zero_last_block(ip, offset, isize);
411 if (error) {
412 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
413 return error;
414 }
415
416 /*
417 * Calculate the range between the new size and the old
418 * where blocks needing to be zeroed may exist. To get the
419 * block where the last byte in the file currently resides,
420 * we need to subtract one from the size and truncate back
421 * to a block boundary. We subtract 1 in case the size is
422 * exactly on a block boundary.
423 */
424 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
425 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
426 end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
427 ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
428 if (last_fsb == end_zero_fsb) {
429 /*
430 * The size was only incremented on its last block.
431 * We took care of that above, so just return.
432 */
433 return 0;
434 }
435
436 ASSERT(start_zero_fsb <= end_zero_fsb);
437 while (start_zero_fsb <= end_zero_fsb) {
438 nimaps = 1;
439 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
440 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
441 0, NULL, 0, &imap, &nimaps, NULL, NULL);
442 if (error) {
443 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
444 return error;
445 }
446 ASSERT(nimaps > 0);
447
448 if (imap.br_state == XFS_EXT_UNWRITTEN ||
449 imap.br_startblock == HOLESTARTBLOCK) {
450 /*
451 * This loop handles initializing pages that were
452 * partially initialized by the code below this
453 * loop. It basically zeroes the part of the page
454 * that sits on a hole and sets the page as P_HOLE
455 * and calls remapf if it is a mapped file.
456 */
457 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
458 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
459 continue;
460 }
461
462 /*
463 * There are blocks we need to zero.
464 * Drop the inode lock while we're doing the I/O.
465 * We'll still have the iolock to protect us.
466 */
467 xfs_iunlock(ip, XFS_ILOCK_EXCL);
468
469 zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
470 zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
471
472 if ((zero_off + zero_len) > offset)
473 zero_len = offset - zero_off;
474
475 error = xfs_iozero(ip, zero_off, zero_len);
476 if (error) {
477 goto out_lock;
478 }
479
480 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
481 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
482
483 xfs_ilock(ip, XFS_ILOCK_EXCL);
484 }
485
486 return 0;
487
488out_lock:
489 xfs_ilock(ip, XFS_ILOCK_EXCL);
490 ASSERT(error >= 0);
491 return error;
492}
493
494ssize_t /* bytes written, or (-) error */
495xfs_write(
496 struct xfs_inode *xip,
497 struct kiocb *iocb,
498 const struct iovec *iovp,
499 unsigned int nsegs,
500 loff_t *offset,
501 int ioflags)
502{
503 struct file *file = iocb->ki_filp;
504 struct address_space *mapping = file->f_mapping;
505 struct inode *inode = mapping->host;
506 unsigned long segs = nsegs;
507 xfs_mount_t *mp;
508 ssize_t ret = 0, error = 0;
509 xfs_fsize_t isize, new_size;
510 int iolock;
511 int eventsent = 0;
512 size_t ocount = 0, count;
513 loff_t pos;
514 int need_i_mutex;
515
516 XFS_STATS_INC(xs_write_calls);
517
518 error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ);
519 if (error)
520 return error;
521
522 count = ocount;
523 pos = *offset;
524
525 if (count == 0)
526 return 0;
527
528 mp = xip->i_mount;
529
530 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
531
532 if (XFS_FORCED_SHUTDOWN(mp))
533 return -EIO;
534
535relock:
536 if (ioflags & IO_ISDIRECT) {
537 iolock = XFS_IOLOCK_SHARED;
538 need_i_mutex = 0;
539 } else {
540 iolock = XFS_IOLOCK_EXCL;
541 need_i_mutex = 1;
542 mutex_lock(&inode->i_mutex);
543 }
544
545 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
546
547start:
548 error = -generic_write_checks(file, &pos, &count,
549 S_ISBLK(inode->i_mode));
550 if (error) {
551 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
552 goto out_unlock_mutex;
553 }
554
555 if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) &&
556 !(ioflags & IO_INVIS) && !eventsent)) {
557 int dmflags = FILP_DELAY_FLAG(file);
558
559 if (need_i_mutex)
560 dmflags |= DM_FLAGS_IMUX;
561
562 xfs_iunlock(xip, XFS_ILOCK_EXCL);
563 error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip,
564 pos, count, dmflags, &iolock);
565 if (error) {
566 goto out_unlock_internal;
567 }
568 xfs_ilock(xip, XFS_ILOCK_EXCL);
569 eventsent = 1;
570
571 /*
572 * The iolock was dropped and reacquired in XFS_SEND_DATA
573 * so we have to recheck the size when appending.
574 * We will only "goto start;" once, since having sent the
575 * event prevents another call to XFS_SEND_DATA, which is
576 * what allows the size to change in the first place.
577 */
578 if ((file->f_flags & O_APPEND) && pos != xip->i_size)
579 goto start;
580 }
581
582 if (ioflags & IO_ISDIRECT) {
583 xfs_buftarg_t *target =
584 XFS_IS_REALTIME_INODE(xip) ?
585 mp->m_rtdev_targp : mp->m_ddev_targp;
586
587 if ((pos & target->bt_smask) || (count & target->bt_smask)) {
588 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
589 return XFS_ERROR(-EINVAL);
590 }
591
592 if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) {
593 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
594 iolock = XFS_IOLOCK_EXCL;
595 need_i_mutex = 1;
596 mutex_lock(&inode->i_mutex);
597 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
598 goto start;
599 }
600 }
601
602 new_size = pos + count;
603 if (new_size > xip->i_size)
604 xip->i_new_size = new_size;
605
606 if (likely(!(ioflags & IO_INVIS)))
607 file_update_time(file);
608
609 /*
610 * If the offset is beyond the size of the file, we have a couple
611 * of things to do. First, if there is already space allocated
612 * we need to either create holes or zero the disk or ...
613 *
614 * If there is a page where the previous size lands, we need
615 * to zero it out up to the new size.
616 */
617
618 if (pos > xip->i_size) {
619 error = xfs_zero_eof(xip, pos, xip->i_size);
620 if (error) {
621 xfs_iunlock(xip, XFS_ILOCK_EXCL);
622 goto out_unlock_internal;
623 }
624 }
625 xfs_iunlock(xip, XFS_ILOCK_EXCL);
626
627 /*
628 * If we're writing the file then make sure to clear the
629 * setuid and setgid bits if the process is not being run
630 * by root. This keeps people from modifying setuid and
631 * setgid binaries.
632 */
633
634 if (((xip->i_d.di_mode & S_ISUID) ||
635 ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
636 (S_ISGID | S_IXGRP))) &&
637 !capable(CAP_FSETID)) {
638 error = xfs_write_clear_setuid(xip);
639 if (likely(!error))
640 error = -file_remove_suid(file);
641 if (unlikely(error)) {
642 goto out_unlock_internal;
643 }
644 }
645
646 /* We can write back this queue in page reclaim */
647 current->backing_dev_info = mapping->backing_dev_info;
648
649 if ((ioflags & IO_ISDIRECT)) {
650 if (mapping->nrpages) {
651 WARN_ON(need_i_mutex == 0);
652 error = xfs_flushinval_pages(xip,
653 (pos & PAGE_CACHE_MASK),
654 -1, FI_REMAPF_LOCKED);
655 if (error)
656 goto out_unlock_internal;
657 }
658
659 if (need_i_mutex) {
660 /* demote the lock now the cached pages are gone */
661 xfs_ilock_demote(xip, XFS_IOLOCK_EXCL);
662 mutex_unlock(&inode->i_mutex);
663
664 iolock = XFS_IOLOCK_SHARED;
665 need_i_mutex = 0;
666 }
667
668 trace_xfs_file_direct_write(xip, count, *offset, ioflags);
669 ret = generic_file_direct_write(iocb, iovp,
670 &segs, pos, offset, count, ocount);
671
672 /*
673 * direct-io write to a hole: fall through to buffered I/O
674 * for completing the rest of the request.
675 */
676 if (ret >= 0 && ret != count) {
677 XFS_STATS_ADD(xs_write_bytes, ret);
678
679 pos += ret;
680 count -= ret;
681
682 ioflags &= ~IO_ISDIRECT;
683 xfs_iunlock(xip, iolock);
684 goto relock;
685 }
686 } else {
687 int enospc = 0;
688 ssize_t ret2 = 0;
689
690write_retry:
691 trace_xfs_file_buffered_write(xip, count, *offset, ioflags);
692 ret2 = generic_file_buffered_write(iocb, iovp, segs,
693 pos, offset, count, ret);
694 /*
695 * if we just got an ENOSPC, flush the inode now we
696 * aren't holding any page locks and retry *once*
697 */
698 if (ret2 == -ENOSPC && !enospc) {
699 error = xfs_flush_pages(xip, 0, -1, 0, FI_NONE);
700 if (error)
701 goto out_unlock_internal;
702 enospc = 1;
703 goto write_retry;
704 }
705 ret = ret2;
706 }
707
708 current->backing_dev_info = NULL;
709
710 isize = i_size_read(inode);
711 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
712 *offset = isize;
713
714 if (*offset > xip->i_size) {
715 xfs_ilock(xip, XFS_ILOCK_EXCL);
716 if (*offset > xip->i_size)
717 xip->i_size = *offset;
718 xfs_iunlock(xip, XFS_ILOCK_EXCL);
719 }
720
721 if (ret == -ENOSPC &&
722 DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
723 xfs_iunlock(xip, iolock);
724 if (need_i_mutex)
725 mutex_unlock(&inode->i_mutex);
726 error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip,
727 DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL,
728 0, 0, 0); /* Delay flag intentionally unused */
729 if (need_i_mutex)
730 mutex_lock(&inode->i_mutex);
731 xfs_ilock(xip, iolock);
732 if (error)
733 goto out_unlock_internal;
734 goto start;
735 }
736
737 error = -ret;
738 if (ret <= 0)
739 goto out_unlock_internal;
740
741 XFS_STATS_ADD(xs_write_bytes, ret);
742
743 /* Handle various SYNC-type writes */
744 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
745 loff_t end = pos + ret - 1;
746 int error2;
747
748 xfs_iunlock(xip, iolock);
749 if (need_i_mutex)
750 mutex_unlock(&inode->i_mutex);
751
752 error2 = filemap_write_and_wait_range(mapping, pos, end);
753 if (!error)
754 error = error2;
755 if (need_i_mutex)
756 mutex_lock(&inode->i_mutex);
757 xfs_ilock(xip, iolock);
758
759 error2 = xfs_fsync(xip);
760 if (!error)
761 error = error2;
762 }
763
764 out_unlock_internal:
765 if (xip->i_new_size) {
766 xfs_ilock(xip, XFS_ILOCK_EXCL);
767 xip->i_new_size = 0;
768 /*
769 * If this was a direct or synchronous I/O that failed (such
770 * as ENOSPC) then part of the I/O may have been written to
771 * disk before the error occured. In this case the on-disk
772 * file size may have been adjusted beyond the in-memory file
773 * size and now needs to be truncated back.
774 */
775 if (xip->i_d.di_size > xip->i_size)
776 xip->i_d.di_size = xip->i_size;
777 xfs_iunlock(xip, XFS_ILOCK_EXCL);
778 }
779 xfs_iunlock(xip, iolock);
780 out_unlock_mutex:
781 if (need_i_mutex)
782 mutex_unlock(&inode->i_mutex);
783 return -error;
784}
785
786/*
787 * All xfs metadata buffers except log state machine buffers
788 * get this attached as their b_bdstrat callback function.
789 * This is so that we can catch a buffer
790 * after prematurely unpinning it to forcibly shutdown the filesystem.
791 */
792int
793xfs_bdstrat_cb(struct xfs_buf *bp)
794{
795 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
796 trace_xfs_bdstrat_shut(bp, _RET_IP_);
797 /*
798 * Metadata write that didn't get logged but
799 * written delayed anyway. These aren't associated
800 * with a transaction, and can be ignored.
801 */
802 if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
803 (XFS_BUF_ISREAD(bp)) == 0)
804 return (xfs_bioerror_relse(bp));
805 else
806 return (xfs_bioerror(bp));
807 }
808
809 xfs_buf_iorequest(bp);
810 return 0;
811}
812
813/*
814 * Wrapper around bdstrat so that we can stop data from going to disk in case
815 * we are shutting down the filesystem. Typically user data goes thru this
816 * path; one of the exceptions is the superblock.
817 */
818void
819xfsbdstrat(
820 struct xfs_mount *mp,
821 struct xfs_buf *bp)
822{
823 ASSERT(mp);
824 if (!XFS_FORCED_SHUTDOWN(mp)) {
825 xfs_buf_iorequest(bp);
826 return;
827 }
828
829 trace_xfs_bdstrat_shut(bp, _RET_IP_);
830 xfs_bioerror_relse(bp);
831}
832
833/*
834 * If the underlying (data/log/rt) device is readonly, there are some
835 * operations that cannot proceed.
836 */
837int
838xfs_dev_is_read_only(
839 xfs_mount_t *mp,
840 char *message)
841{
842 if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
843 xfs_readonly_buftarg(mp->m_logdev_targp) ||
844 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
845 cmn_err(CE_NOTE,
846 "XFS: %s required on read-only device.", message);
847 cmn_err(CE_NOTE,
848 "XFS: write access unavailable, cannot proceed.");
849 return EROFS;
850 }
851 return 0;
852}
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
deleted file mode 100644
index d1f7789c7ffb..000000000000
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ /dev/null
@@ -1,32 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_LRW_H__
19#define __XFS_LRW_H__
20
21struct xfs_mount;
22struct xfs_inode;
23struct xfs_buf;
24
25/* errors from xfsbdstrat() must be extracted from the buffer */
26extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
27extern int xfs_bdstrat_cb(struct xfs_buf *);
28extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
29
30extern int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
31
32#endif /* __XFS_LRW_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 3d4a0c84d634..1947514ce1ad 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -44,20 +44,6 @@ xfs_quota_type(int type)
44} 44}
45 45
46STATIC int 46STATIC int
47xfs_fs_quota_sync(
48 struct super_block *sb,
49 int type)
50{
51 struct xfs_mount *mp = XFS_M(sb);
52
53 if (sb->s_flags & MS_RDONLY)
54 return -EROFS;
55 if (!XFS_IS_QUOTA_RUNNING(mp))
56 return -ENOSYS;
57 return -xfs_sync_data(mp, 0);
58}
59
60STATIC int
61xfs_fs_get_xstate( 47xfs_fs_get_xstate(
62 struct super_block *sb, 48 struct super_block *sb,
63 struct fs_quota_stat *fqs) 49 struct fs_quota_stat *fqs)
@@ -82,8 +68,6 @@ xfs_fs_set_xstate(
82 return -EROFS; 68 return -EROFS;
83 if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp)) 69 if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
84 return -ENOSYS; 70 return -ENOSYS;
85 if (!capable(CAP_SYS_ADMIN))
86 return -EPERM;
87 71
88 if (uflags & XFS_QUOTA_UDQ_ACCT) 72 if (uflags & XFS_QUOTA_UDQ_ACCT)
89 flags |= XFS_UQUOTA_ACCT; 73 flags |= XFS_UQUOTA_ACCT;
@@ -144,14 +128,11 @@ xfs_fs_set_xquota(
144 return -ENOSYS; 128 return -ENOSYS;
145 if (!XFS_IS_QUOTA_ON(mp)) 129 if (!XFS_IS_QUOTA_ON(mp))
146 return -ESRCH; 130 return -ESRCH;
147 if (!capable(CAP_SYS_ADMIN))
148 return -EPERM;
149 131
150 return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq); 132 return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
151} 133}
152 134
153const struct quotactl_ops xfs_quotactl_operations = { 135const struct quotactl_ops xfs_quotactl_operations = {
154 .quota_sync = xfs_fs_quota_sync,
155 .get_xstate = xfs_fs_get_xstate, 136 .get_xstate = xfs_fs_get_xstate,
156 .set_xstate = xfs_fs_set_xstate, 137 .set_xstate = xfs_fs_set_xstate,
157 .get_xquota = xfs_fs_get_xquota, 138 .get_xquota = xfs_fs_get_xquota,
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 09783cc444ac..52e06b487ced 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -61,6 +61,7 @@
61 61
62#include <linux/namei.h> 62#include <linux/namei.h>
63#include <linux/init.h> 63#include <linux/init.h>
64#include <linux/slab.h>
64#include <linux/mount.h> 65#include <linux/mount.h>
65#include <linux/mempool.h> 66#include <linux/mempool.h>
66#include <linux/writeback.h> 67#include <linux/writeback.h>
@@ -877,12 +878,11 @@ xfsaild(
877{ 878{
878 struct xfs_ail *ailp = data; 879 struct xfs_ail *ailp = data;
879 xfs_lsn_t last_pushed_lsn = 0; 880 xfs_lsn_t last_pushed_lsn = 0;
880 long tout = 0; 881 long tout = 0; /* milliseconds */
881 882
882 while (!kthread_should_stop()) { 883 while (!kthread_should_stop()) {
883 if (tout) 884 schedule_timeout_interruptible(tout ?
884 schedule_timeout_interruptible(msecs_to_jiffies(tout)); 885 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
885 tout = 1000;
886 886
887 /* swsusp */ 887 /* swsusp */
888 try_to_freeze(); 888 try_to_freeze();
@@ -954,16 +954,14 @@ xfs_fs_destroy_inode(
954 ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); 954 ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
955 955
956 /* 956 /*
957 * If we have nothing to flush with this inode then complete the 957 * We always use background reclaim here because even if the
958 * teardown now, otherwise delay the flush operation. 958 * inode is clean, it still may be under IO and hence we have
959 * to take the flush lock. The background reclaim path handles
960 * this more efficiently than we can here, so simply let background
961 * reclaim tear down all inodes.
959 */ 962 */
960 if (!xfs_inode_clean(ip)) {
961 xfs_inode_set_reclaim_tag(ip);
962 return;
963 }
964
965out_reclaim: 963out_reclaim:
966 xfs_ireclaim(ip); 964 xfs_inode_set_reclaim_tag(ip);
967} 965}
968 966
969/* 967/*
@@ -1024,59 +1022,108 @@ xfs_fs_dirty_inode(
1024 XFS_I(inode)->i_update_core = 1; 1022 XFS_I(inode)->i_update_core = 1;
1025} 1023}
1026 1024
1027/* 1025STATIC int
1028 * Attempt to flush the inode, this will actually fail 1026xfs_log_inode(
1029 * if the inode is pinned, but we dirty the inode again 1027 struct xfs_inode *ip)
1030 * at the point when it is unpinned after a log write, 1028{
1031 * since this is when the inode itself becomes flushable. 1029 struct xfs_mount *mp = ip->i_mount;
1032 */ 1030 struct xfs_trans *tp;
1031 int error;
1032
1033 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1034 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
1035 error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
1036
1037 if (error) {
1038 xfs_trans_cancel(tp, 0);
1039 /* we need to return with the lock hold shared */
1040 xfs_ilock(ip, XFS_ILOCK_SHARED);
1041 return error;
1042 }
1043
1044 xfs_ilock(ip, XFS_ILOCK_EXCL);
1045
1046 /*
1047 * Note - it's possible that we might have pushed ourselves out of the
1048 * way during trans_reserve which would flush the inode. But there's
1049 * no guarantee that the inode buffer has actually gone out yet (it's
1050 * delwri). Plus the buffer could be pinned anyway if it's part of
1051 * an inode in another recent transaction. So we play it safe and
1052 * fire off the transaction anyway.
1053 */
1054 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1055 xfs_trans_ihold(tp, ip);
1056 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1057 xfs_trans_set_sync(tp);
1058 error = xfs_trans_commit(tp, 0);
1059 xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
1060
1061 return error;
1062}
1063
1033STATIC int 1064STATIC int
1034xfs_fs_write_inode( 1065xfs_fs_write_inode(
1035 struct inode *inode, 1066 struct inode *inode,
1036 int sync) 1067 struct writeback_control *wbc)
1037{ 1068{
1038 struct xfs_inode *ip = XFS_I(inode); 1069 struct xfs_inode *ip = XFS_I(inode);
1039 struct xfs_mount *mp = ip->i_mount; 1070 struct xfs_mount *mp = ip->i_mount;
1040 int error = 0; 1071 int error = EAGAIN;
1041 1072
1042 xfs_itrace_entry(ip); 1073 xfs_itrace_entry(ip);
1043 1074
1044 if (XFS_FORCED_SHUTDOWN(mp)) 1075 if (XFS_FORCED_SHUTDOWN(mp))
1045 return XFS_ERROR(EIO); 1076 return XFS_ERROR(EIO);
1046 1077
1047 if (sync) { 1078 if (wbc->sync_mode == WB_SYNC_ALL) {
1048 error = xfs_wait_on_pages(ip, 0, -1); 1079 /*
1049 if (error) 1080 * Make sure the inode has hit stable storage. By using the
1081 * log and the fsync transactions we reduce the IOs we have
1082 * to do here from two (log and inode) to just the log.
1083 *
1084 * Note: We still need to do a delwri write of the inode after
1085 * this to flush it to the backing buffer so that bulkstat
1086 * works properly if this is the first time the inode has been
1087 * written. Because we hold the ilock atomically over the
1088 * transaction commit and the inode flush we are guaranteed
1089 * that the inode is not pinned when it returns. If the flush
1090 * lock is already held, then the inode has already been
1091 * flushed once and we don't need to flush it again. Hence
1092 * the code will only flush the inode if it isn't already
1093 * being flushed.
1094 */
1095 xfs_ilock(ip, XFS_ILOCK_SHARED);
1096 if (ip->i_update_core) {
1097 error = xfs_log_inode(ip);
1098 if (error)
1099 goto out_unlock;
1100 }
1101 } else {
1102 /*
1103 * We make this non-blocking if the inode is contended, return
1104 * EAGAIN to indicate to the caller that they did not succeed.
1105 * This prevents the flush path from blocking on inodes inside
1106 * another operation right now, they get caught later by xfs_sync.
1107 */
1108 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
1050 goto out; 1109 goto out;
1051 } 1110 }
1052 1111
1053 /* 1112 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
1054 * Bypass inodes which have already been cleaned by 1113 goto out_unlock;
1055 * the inode flush clustering code inside xfs_iflush
1056 */
1057 if (xfs_inode_clean(ip))
1058 goto out;
1059 1114
1060 /* 1115 /*
1061 * We make this non-blocking if the inode is contended, return 1116 * Now we have the flush lock and the inode is not pinned, we can check
1062 * EAGAIN to indicate to the caller that they did not succeed. 1117 * if the inode is really clean as we know that there are no pending
1063 * This prevents the flush path from blocking on inodes inside 1118 * transaction completions, it is not waiting on the delayed write
1064 * another operation right now, they get caught later by xfs_sync. 1119 * queue and there is no IO in progress.
1065 */ 1120 */
1066 if (sync) { 1121 if (xfs_inode_clean(ip)) {
1067 xfs_ilock(ip, XFS_ILOCK_SHARED); 1122 xfs_ifunlock(ip);
1068 xfs_iflock(ip); 1123 error = 0;
1069 1124 goto out_unlock;
1070 error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
1071 } else {
1072 error = EAGAIN;
1073 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
1074 goto out;
1075 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
1076 goto out_unlock;
1077
1078 error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK);
1079 } 1125 }
1126 error = xfs_iflush(ip, 0);
1080 1127
1081 out_unlock: 1128 out_unlock:
1082 xfs_iunlock(ip, XFS_ILOCK_SHARED); 1129 xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -1259,6 +1306,29 @@ xfs_fs_statfs(
1259 return 0; 1306 return 0;
1260} 1307}
1261 1308
1309STATIC void
1310xfs_save_resvblks(struct xfs_mount *mp)
1311{
1312 __uint64_t resblks = 0;
1313
1314 mp->m_resblks_save = mp->m_resblks;
1315 xfs_reserve_blocks(mp, &resblks, NULL);
1316}
1317
1318STATIC void
1319xfs_restore_resvblks(struct xfs_mount *mp)
1320{
1321 __uint64_t resblks;
1322
1323 if (mp->m_resblks_save) {
1324 resblks = mp->m_resblks_save;
1325 mp->m_resblks_save = 0;
1326 } else
1327 resblks = xfs_default_resblks(mp);
1328
1329 xfs_reserve_blocks(mp, &resblks, NULL);
1330}
1331
1262STATIC int 1332STATIC int
1263xfs_fs_remount( 1333xfs_fs_remount(
1264 struct super_block *sb, 1334 struct super_block *sb,
@@ -1338,11 +1408,27 @@ xfs_fs_remount(
1338 } 1408 }
1339 mp->m_update_flags = 0; 1409 mp->m_update_flags = 0;
1340 } 1410 }
1411
1412 /*
1413 * Fill out the reserve pool if it is empty. Use the stashed
1414 * value if it is non-zero, otherwise go with the default.
1415 */
1416 xfs_restore_resvblks(mp);
1341 } 1417 }
1342 1418
1343 /* rw -> ro */ 1419 /* rw -> ro */
1344 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { 1420 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
1421 /*
1422 * After we have synced the data but before we sync the
1423 * metadata, we need to free up the reserve block pool so that
1424 * the used block count in the superblock on disk is correct at
1425 * the end of the remount. Stash the current reserve pool size
1426 * so that if we get remounted rw, we can return it to the same
1427 * size.
1428 */
1429
1345 xfs_quiesce_data(mp); 1430 xfs_quiesce_data(mp);
1431 xfs_save_resvblks(mp);
1346 xfs_quiesce_attr(mp); 1432 xfs_quiesce_attr(mp);
1347 mp->m_flags |= XFS_MOUNT_RDONLY; 1433 mp->m_flags |= XFS_MOUNT_RDONLY;
1348 } 1434 }
@@ -1361,11 +1447,22 @@ xfs_fs_freeze(
1361{ 1447{
1362 struct xfs_mount *mp = XFS_M(sb); 1448 struct xfs_mount *mp = XFS_M(sb);
1363 1449
1450 xfs_save_resvblks(mp);
1364 xfs_quiesce_attr(mp); 1451 xfs_quiesce_attr(mp);
1365 return -xfs_fs_log_dummy(mp); 1452 return -xfs_fs_log_dummy(mp);
1366} 1453}
1367 1454
1368STATIC int 1455STATIC int
1456xfs_fs_unfreeze(
1457 struct super_block *sb)
1458{
1459 struct xfs_mount *mp = XFS_M(sb);
1460
1461 xfs_restore_resvblks(mp);
1462 return 0;
1463}
1464
1465STATIC int
1369xfs_fs_show_options( 1466xfs_fs_show_options(
1370 struct seq_file *m, 1467 struct seq_file *m,
1371 struct vfsmount *mnt) 1468 struct vfsmount *mnt)
@@ -1587,6 +1684,7 @@ static const struct super_operations xfs_super_operations = {
1587 .put_super = xfs_fs_put_super, 1684 .put_super = xfs_fs_put_super,
1588 .sync_fs = xfs_fs_sync_fs, 1685 .sync_fs = xfs_fs_sync_fs,
1589 .freeze_fs = xfs_fs_freeze, 1686 .freeze_fs = xfs_fs_freeze,
1687 .unfreeze_fs = xfs_fs_unfreeze,
1590 .statfs = xfs_fs_statfs, 1688 .statfs = xfs_fs_statfs,
1591 .remount_fs = xfs_fs_remount, 1689 .remount_fs = xfs_fs_remount,
1592 .show_options = xfs_fs_show_options, 1690 .show_options = xfs_fs_show_options,
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 6fed97a8cd3e..fd9698215759 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -65,7 +65,6 @@ xfs_inode_ag_lookup(
65 * as the tree is sparse and a gang lookup walks to find 65 * as the tree is sparse and a gang lookup walks to find
66 * the number of objects requested. 66 * the number of objects requested.
67 */ 67 */
68 read_lock(&pag->pag_ici_lock);
69 if (tag == XFS_ICI_NO_TAG) { 68 if (tag == XFS_ICI_NO_TAG) {
70 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 69 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
71 (void **)&ip, *first_index, 1); 70 (void **)&ip, *first_index, 1);
@@ -74,7 +73,7 @@ xfs_inode_ag_lookup(
74 (void **)&ip, *first_index, 1, tag); 73 (void **)&ip, *first_index, 1, tag);
75 } 74 }
76 if (!nr_found) 75 if (!nr_found)
77 goto unlock; 76 return NULL;
78 77
79 /* 78 /*
80 * Update the index for the next lookup. Catch overflows 79 * Update the index for the next lookup. Catch overflows
@@ -84,25 +83,20 @@ xfs_inode_ag_lookup(
84 */ 83 */
85 *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 84 *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
86 if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 85 if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
87 goto unlock; 86 return NULL;
88
89 return ip; 87 return ip;
90
91unlock:
92 read_unlock(&pag->pag_ici_lock);
93 return NULL;
94} 88}
95 89
96STATIC int 90STATIC int
97xfs_inode_ag_walk( 91xfs_inode_ag_walk(
98 struct xfs_mount *mp, 92 struct xfs_mount *mp,
99 xfs_agnumber_t ag, 93 struct xfs_perag *pag,
100 int (*execute)(struct xfs_inode *ip, 94 int (*execute)(struct xfs_inode *ip,
101 struct xfs_perag *pag, int flags), 95 struct xfs_perag *pag, int flags),
102 int flags, 96 int flags,
103 int tag) 97 int tag,
98 int exclusive)
104{ 99{
105 struct xfs_perag *pag = &mp->m_perag[ag];
106 uint32_t first_index; 100 uint32_t first_index;
107 int last_error = 0; 101 int last_error = 0;
108 int skipped; 102 int skipped;
@@ -114,10 +108,20 @@ restart:
114 int error = 0; 108 int error = 0;
115 xfs_inode_t *ip; 109 xfs_inode_t *ip;
116 110
111 if (exclusive)
112 write_lock(&pag->pag_ici_lock);
113 else
114 read_lock(&pag->pag_ici_lock);
117 ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); 115 ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
118 if (!ip) 116 if (!ip) {
117 if (exclusive)
118 write_unlock(&pag->pag_ici_lock);
119 else
120 read_unlock(&pag->pag_ici_lock);
119 break; 121 break;
122 }
120 123
124 /* execute releases pag->pag_ici_lock */
121 error = execute(ip, pag, flags); 125 error = execute(ip, pag, flags);
122 if (error == EAGAIN) { 126 if (error == EAGAIN) {
123 skipped++; 127 skipped++;
@@ -125,9 +129,8 @@ restart:
125 } 129 }
126 if (error) 130 if (error)
127 last_error = error; 131 last_error = error;
128 /* 132
129 * bail out if the filesystem is corrupted. 133 /* bail out if the filesystem is corrupted. */
130 */
131 if (error == EFSCORRUPTED) 134 if (error == EFSCORRUPTED)
132 break; 135 break;
133 136
@@ -137,8 +140,6 @@ restart:
137 delay(1); 140 delay(1);
138 goto restart; 141 goto restart;
139 } 142 }
140
141 xfs_put_perag(mp, pag);
142 return last_error; 143 return last_error;
143} 144}
144 145
@@ -148,16 +149,24 @@ xfs_inode_ag_iterator(
148 int (*execute)(struct xfs_inode *ip, 149 int (*execute)(struct xfs_inode *ip,
149 struct xfs_perag *pag, int flags), 150 struct xfs_perag *pag, int flags),
150 int flags, 151 int flags,
151 int tag) 152 int tag,
153 int exclusive)
152{ 154{
153 int error = 0; 155 int error = 0;
154 int last_error = 0; 156 int last_error = 0;
155 xfs_agnumber_t ag; 157 xfs_agnumber_t ag;
156 158
157 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { 159 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
158 if (!mp->m_perag[ag].pag_ici_init) 160 struct xfs_perag *pag;
161
162 pag = xfs_perag_get(mp, ag);
163 if (!pag->pag_ici_init) {
164 xfs_perag_put(pag);
159 continue; 165 continue;
160 error = xfs_inode_ag_walk(mp, ag, execute, flags, tag); 166 }
167 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
168 exclusive);
169 xfs_perag_put(pag);
161 if (error) { 170 if (error) {
162 last_error = error; 171 last_error = error;
163 if (error == EFSCORRUPTED) 172 if (error == EFSCORRUPTED)
@@ -174,30 +183,31 @@ xfs_sync_inode_valid(
174 struct xfs_perag *pag) 183 struct xfs_perag *pag)
175{ 184{
176 struct inode *inode = VFS_I(ip); 185 struct inode *inode = VFS_I(ip);
186 int error = EFSCORRUPTED;
177 187
178 /* nothing to sync during shutdown */ 188 /* nothing to sync during shutdown */
179 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 189 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
180 read_unlock(&pag->pag_ici_lock); 190 goto out_unlock;
181 return EFSCORRUPTED;
182 }
183 191
184 /* 192 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
185 * If we can't get a reference on the inode, it must be in reclaim. 193 error = ENOENT;
186 * Leave it for the reclaim code to flush. Also avoid inodes that 194 if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
187 * haven't been fully initialised. 195 goto out_unlock;
188 */ 196
189 if (!igrab(inode)) { 197 /* If we can't grab the inode, it must on it's way to reclaim. */
190 read_unlock(&pag->pag_ici_lock); 198 if (!igrab(inode))
191 return ENOENT; 199 goto out_unlock;
192 }
193 read_unlock(&pag->pag_ici_lock);
194 200
195 if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) { 201 if (is_bad_inode(inode)) {
196 IRELE(ip); 202 IRELE(ip);
197 return ENOENT; 203 goto out_unlock;
198 } 204 }
199 205
200 return 0; 206 /* inode is valid */
207 error = 0;
208out_unlock:
209 read_unlock(&pag->pag_ici_lock);
210 return error;
201} 211}
202 212
203STATIC int 213STATIC int
@@ -224,7 +234,7 @@ xfs_sync_inode_data(
224 } 234 }
225 235
226 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? 236 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
227 0 : XFS_B_ASYNC, FI_NONE); 237 0 : XBF_ASYNC, FI_NONE);
228 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 238 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
229 239
230 out_wait: 240 out_wait:
@@ -260,8 +270,7 @@ xfs_sync_inode_attr(
260 goto out_unlock; 270 goto out_unlock;
261 } 271 }
262 272
263 error = xfs_iflush(ip, (flags & SYNC_WAIT) ? 273 error = xfs_iflush(ip, flags);
264 XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI);
265 274
266 out_unlock: 275 out_unlock:
267 xfs_iunlock(ip, XFS_ILOCK_SHARED); 276 xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -282,14 +291,11 @@ xfs_sync_data(
282 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); 291 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
283 292
284 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, 293 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
285 XFS_ICI_NO_TAG); 294 XFS_ICI_NO_TAG, 0);
286 if (error) 295 if (error)
287 return XFS_ERROR(error); 296 return XFS_ERROR(error);
288 297
289 xfs_log_force(mp, 0, 298 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
290 (flags & SYNC_WAIT) ?
291 XFS_LOG_FORCE | XFS_LOG_SYNC :
292 XFS_LOG_FORCE);
293 return 0; 299 return 0;
294} 300}
295 301
@@ -304,7 +310,7 @@ xfs_sync_attr(
304 ASSERT((flags & ~SYNC_WAIT) == 0); 310 ASSERT((flags & ~SYNC_WAIT) == 0);
305 311
306 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, 312 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
307 XFS_ICI_NO_TAG); 313 XFS_ICI_NO_TAG, 0);
308} 314}
309 315
310STATIC int 316STATIC int
@@ -315,10 +321,6 @@ xfs_commit_dummy_trans(
315 struct xfs_inode *ip = mp->m_rootip; 321 struct xfs_inode *ip = mp->m_rootip;
316 struct xfs_trans *tp; 322 struct xfs_trans *tp;
317 int error; 323 int error;
318 int log_flags = XFS_LOG_FORCE;
319
320 if (flags & SYNC_WAIT)
321 log_flags |= XFS_LOG_SYNC;
322 324
323 /* 325 /*
324 * Put a dummy transaction in the log to tell recovery 326 * Put a dummy transaction in the log to tell recovery
@@ -340,11 +342,11 @@ xfs_commit_dummy_trans(
340 xfs_iunlock(ip, XFS_ILOCK_EXCL); 342 xfs_iunlock(ip, XFS_ILOCK_EXCL);
341 343
342 /* the log force ensures this transaction is pushed to disk */ 344 /* the log force ensures this transaction is pushed to disk */
343 xfs_log_force(mp, 0, log_flags); 345 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
344 return error; 346 return error;
345} 347}
346 348
347int 349STATIC int
348xfs_sync_fsdata( 350xfs_sync_fsdata(
349 struct xfs_mount *mp, 351 struct xfs_mount *mp,
350 int flags) 352 int flags)
@@ -360,7 +362,7 @@ xfs_sync_fsdata(
360 if (flags & SYNC_TRYLOCK) { 362 if (flags & SYNC_TRYLOCK) {
361 ASSERT(!(flags & SYNC_WAIT)); 363 ASSERT(!(flags & SYNC_WAIT));
362 364
363 bp = xfs_getsb(mp, XFS_BUF_TRYLOCK); 365 bp = xfs_getsb(mp, XBF_TRYLOCK);
364 if (!bp) 366 if (!bp)
365 goto out; 367 goto out;
366 368
@@ -380,7 +382,7 @@ xfs_sync_fsdata(
380 * become pinned in between there and here. 382 * become pinned in between there and here.
381 */ 383 */
382 if (XFS_BUF_ISPINNED(bp)) 384 if (XFS_BUF_ISPINNED(bp))
383 xfs_log_force(mp, 0, XFS_LOG_FORCE); 385 xfs_log_force(mp, 0);
384 } 386 }
385 387
386 388
@@ -441,9 +443,6 @@ xfs_quiesce_data(
441 xfs_sync_data(mp, SYNC_WAIT); 443 xfs_sync_data(mp, SYNC_WAIT);
442 xfs_qm_sync(mp, SYNC_WAIT); 444 xfs_qm_sync(mp, SYNC_WAIT);
443 445
444 /* drop inode references pinned by filestreams */
445 xfs_filestream_flush(mp);
446
447 /* write superblock and hoover up shutdown errors */ 446 /* write superblock and hoover up shutdown errors */
448 error = xfs_sync_fsdata(mp, SYNC_WAIT); 447 error = xfs_sync_fsdata(mp, SYNC_WAIT);
449 448
@@ -460,16 +459,18 @@ xfs_quiesce_fs(
460{ 459{
461 int count = 0, pincount; 460 int count = 0, pincount;
462 461
462 xfs_reclaim_inodes(mp, 0);
463 xfs_flush_buftarg(mp->m_ddev_targp, 0); 463 xfs_flush_buftarg(mp->m_ddev_targp, 0);
464 xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
465 464
466 /* 465 /*
467 * This loop must run at least twice. The first instance of the loop 466 * This loop must run at least twice. The first instance of the loop
468 * will flush most meta data but that will generate more meta data 467 * will flush most meta data but that will generate more meta data
469 * (typically directory updates). Which then must be flushed and 468 * (typically directory updates). Which then must be flushed and
470 * logged before we can write the unmount record. 469 * logged before we can write the unmount record. We also do sync
470 * reclaim of inodes to catch any that the above delwri flush skipped.
471 */ 471 */
472 do { 472 do {
473 xfs_reclaim_inodes(mp, SYNC_WAIT);
473 xfs_sync_attr(mp, SYNC_WAIT); 474 xfs_sync_attr(mp, SYNC_WAIT);
474 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); 475 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
475 if (!pincount) { 476 if (!pincount) {
@@ -568,7 +569,7 @@ xfs_flush_inodes(
568 igrab(inode); 569 igrab(inode);
569 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion); 570 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
570 wait_for_completion(&completion); 571 wait_for_completion(&completion);
571 xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC); 572 xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
572} 573}
573 574
574/* 575/*
@@ -584,8 +585,8 @@ xfs_sync_worker(
584 int error; 585 int error;
585 586
586 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 587 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
587 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 588 xfs_log_force(mp, 0);
588 xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC); 589 xfs_reclaim_inodes(mp, 0);
589 /* dgc: errors ignored here */ 590 /* dgc: errors ignored here */
590 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 591 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
591 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); 592 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
@@ -606,7 +607,8 @@ xfssyncd(
606 set_freezable(); 607 set_freezable();
607 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10); 608 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
608 for (;;) { 609 for (;;) {
609 timeleft = schedule_timeout_interruptible(timeleft); 610 if (list_empty(&mp->m_sync_list))
611 timeleft = schedule_timeout_interruptible(timeleft);
610 /* swsusp */ 612 /* swsusp */
611 try_to_freeze(); 613 try_to_freeze();
612 if (kthread_should_stop() && list_empty(&mp->m_sync_list)) 614 if (kthread_should_stop() && list_empty(&mp->m_sync_list))
@@ -626,8 +628,7 @@ xfssyncd(
626 list_add_tail(&mp->m_sync_work.w_list, 628 list_add_tail(&mp->m_sync_work.w_list,
627 &mp->m_sync_list); 629 &mp->m_sync_list);
628 } 630 }
629 list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list) 631 list_splice_init(&mp->m_sync_list, &tmp);
630 list_move(&work->w_list, &tmp);
631 spin_unlock(&mp->m_sync_lock); 632 spin_unlock(&mp->m_sync_lock);
632 633
633 list_for_each_entry_safe(work, n, &tmp, w_list) { 634 list_for_each_entry_safe(work, n, &tmp, w_list) {
@@ -664,60 +665,6 @@ xfs_syncd_stop(
664 kthread_stop(mp->m_sync_task); 665 kthread_stop(mp->m_sync_task);
665} 666}
666 667
667STATIC int
668xfs_reclaim_inode(
669 xfs_inode_t *ip,
670 int sync_mode)
671{
672 xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
673
674 /* The hash lock here protects a thread in xfs_iget_core from
675 * racing with us on linking the inode back with a vnode.
676 * Once we have the XFS_IRECLAIM flag set it will not touch
677 * us.
678 */
679 write_lock(&pag->pag_ici_lock);
680 spin_lock(&ip->i_flags_lock);
681 if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
682 !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
683 spin_unlock(&ip->i_flags_lock);
684 write_unlock(&pag->pag_ici_lock);
685 return -EAGAIN;
686 }
687 __xfs_iflags_set(ip, XFS_IRECLAIM);
688 spin_unlock(&ip->i_flags_lock);
689 write_unlock(&pag->pag_ici_lock);
690 xfs_put_perag(ip->i_mount, pag);
691
692 /*
693 * If the inode is still dirty, then flush it out. If the inode
694 * is not in the AIL, then it will be OK to flush it delwri as
695 * long as xfs_iflush() does not keep any references to the inode.
696 * We leave that decision up to xfs_iflush() since it has the
697 * knowledge of whether it's OK to simply do a delwri flush of
698 * the inode or whether we need to wait until the inode is
699 * pulled from the AIL.
700 * We get the flush lock regardless, though, just to make sure
701 * we don't free it while it is being flushed.
702 */
703 xfs_ilock(ip, XFS_ILOCK_EXCL);
704 xfs_iflock(ip);
705
706 /*
707 * In the case of a forced shutdown we rely on xfs_iflush() to
708 * wait for the inode to be unpinned before returning an error.
709 */
710 if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
711 /* synchronize with xfs_iflush_done */
712 xfs_iflock(ip);
713 xfs_ifunlock(ip);
714 }
715
716 xfs_iunlock(ip, XFS_ILOCK_EXCL);
717 xfs_ireclaim(ip);
718 return 0;
719}
720
721void 668void
722__xfs_inode_set_reclaim_tag( 669__xfs_inode_set_reclaim_tag(
723 struct xfs_perag *pag, 670 struct xfs_perag *pag,
@@ -737,16 +684,17 @@ void
737xfs_inode_set_reclaim_tag( 684xfs_inode_set_reclaim_tag(
738 xfs_inode_t *ip) 685 xfs_inode_t *ip)
739{ 686{
740 xfs_mount_t *mp = ip->i_mount; 687 struct xfs_mount *mp = ip->i_mount;
741 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); 688 struct xfs_perag *pag;
742 689
743 read_lock(&pag->pag_ici_lock); 690 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
691 write_lock(&pag->pag_ici_lock);
744 spin_lock(&ip->i_flags_lock); 692 spin_lock(&ip->i_flags_lock);
745 __xfs_inode_set_reclaim_tag(pag, ip); 693 __xfs_inode_set_reclaim_tag(pag, ip);
746 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 694 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
747 spin_unlock(&ip->i_flags_lock); 695 spin_unlock(&ip->i_flags_lock);
748 read_unlock(&pag->pag_ici_lock); 696 write_unlock(&pag->pag_ici_lock);
749 xfs_put_perag(mp, pag); 697 xfs_perag_put(pag);
750} 698}
751 699
752void 700void
@@ -759,20 +707,145 @@ __xfs_inode_clear_reclaim_tag(
759 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); 707 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
760} 708}
761 709
710/*
711 * Inodes in different states need to be treated differently, and the return
712 * value of xfs_iflush is not sufficient to get this right. The following table
713 * lists the inode states and the reclaim actions necessary for non-blocking
714 * reclaim:
715 *
716 *
717 * inode state iflush ret required action
718 * --------------- ---------- ---------------
719 * bad - reclaim
720 * shutdown EIO unpin and reclaim
721 * clean, unpinned 0 reclaim
722 * stale, unpinned 0 reclaim
723 * clean, pinned(*) 0 requeue
724 * stale, pinned EAGAIN requeue
725 * dirty, delwri ok 0 requeue
726 * dirty, delwri blocked EAGAIN requeue
727 * dirty, sync flush 0 reclaim
728 *
729 * (*) dgc: I don't think the clean, pinned state is possible but it gets
730 * handled anyway given the order of checks implemented.
731 *
732 * As can be seen from the table, the return value of xfs_iflush() is not
733 * sufficient to correctly decide the reclaim action here. The checks in
734 * xfs_iflush() might look like duplicates, but they are not.
735 *
736 * Also, because we get the flush lock first, we know that any inode that has
737 * been flushed delwri has had the flush completed by the time we check that
738 * the inode is clean. The clean inode check needs to be done before flushing
739 * the inode delwri otherwise we would loop forever requeuing clean inodes as
740 * we cannot tell apart a successful delwri flush and a clean inode from the
741 * return value of xfs_iflush().
742 *
743 * Note that because the inode is flushed delayed write by background
744 * writeback, the flush lock may already be held here and waiting on it can
745 * result in very long latencies. Hence for sync reclaims, where we wait on the
746 * flush lock, the caller should push out delayed write inodes first before
747 * trying to reclaim them to minimise the amount of time spent waiting. For
748 * background reclaim, we just requeue the inode for the next pass.
749 *
750 * Hence the order of actions after gaining the locks should be:
751 * bad => reclaim
752 * shutdown => unpin and reclaim
753 * pinned, delwri => requeue
754 * pinned, sync => unpin
755 * stale => reclaim
756 * clean => reclaim
757 * dirty, delwri => flush and requeue
758 * dirty, sync => flush, wait and reclaim
759 */
762STATIC int 760STATIC int
763xfs_reclaim_inode_now( 761xfs_reclaim_inode(
764 struct xfs_inode *ip, 762 struct xfs_inode *ip,
765 struct xfs_perag *pag, 763 struct xfs_perag *pag,
766 int flags) 764 int sync_mode)
767{ 765{
768 /* ignore if already under reclaim */ 766 int error = 0;
769 if (xfs_iflags_test(ip, XFS_IRECLAIM)) { 767
770 read_unlock(&pag->pag_ici_lock); 768 /*
769 * The radix tree lock here protects a thread in xfs_iget from racing
770 * with us starting reclaim on the inode. Once we have the
771 * XFS_IRECLAIM flag set it will not touch us.
772 */
773 spin_lock(&ip->i_flags_lock);
774 ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
775 if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
776 /* ignore as it is already under reclaim */
777 spin_unlock(&ip->i_flags_lock);
778 write_unlock(&pag->pag_ici_lock);
771 return 0; 779 return 0;
772 } 780 }
773 read_unlock(&pag->pag_ici_lock); 781 __xfs_iflags_set(ip, XFS_IRECLAIM);
782 spin_unlock(&ip->i_flags_lock);
783 write_unlock(&pag->pag_ici_lock);
784
785 xfs_ilock(ip, XFS_ILOCK_EXCL);
786 if (!xfs_iflock_nowait(ip)) {
787 if (!(sync_mode & SYNC_WAIT))
788 goto out;
789 xfs_iflock(ip);
790 }
791
792 if (is_bad_inode(VFS_I(ip)))
793 goto reclaim;
794 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
795 xfs_iunpin_wait(ip);
796 goto reclaim;
797 }
798 if (xfs_ipincount(ip)) {
799 if (!(sync_mode & SYNC_WAIT)) {
800 xfs_ifunlock(ip);
801 goto out;
802 }
803 xfs_iunpin_wait(ip);
804 }
805 if (xfs_iflags_test(ip, XFS_ISTALE))
806 goto reclaim;
807 if (xfs_inode_clean(ip))
808 goto reclaim;
809
810 /* Now we have an inode that needs flushing */
811 error = xfs_iflush(ip, sync_mode);
812 if (sync_mode & SYNC_WAIT) {
813 xfs_iflock(ip);
814 goto reclaim;
815 }
816
817 /*
818 * When we have to flush an inode but don't have SYNC_WAIT set, we
819 * flush the inode out using a delwri buffer and wait for the next
820 * call into reclaim to find it in a clean state instead of waiting for
821 * it now. We also don't return errors here - if the error is transient
822 * then the next reclaim pass will flush the inode, and if the error
823 * is permanent then the next sync reclaim will reclaim the inode and
824 * pass on the error.
825 */
826 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
827 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
828 "inode 0x%llx background reclaim flush failed with %d",
829 (long long)ip->i_ino, error);
830 }
831out:
832 xfs_iflags_clear(ip, XFS_IRECLAIM);
833 xfs_iunlock(ip, XFS_ILOCK_EXCL);
834 /*
835 * We could return EAGAIN here to make reclaim rescan the inode tree in
836 * a short while. However, this just burns CPU time scanning the tree
837 * waiting for IO to complete and xfssyncd never goes back to the idle
838 * state. Instead, return 0 to let the next scheduled background reclaim
839 * attempt to reclaim the inode again.
840 */
841 return 0;
842
843reclaim:
844 xfs_ifunlock(ip);
845 xfs_iunlock(ip, XFS_ILOCK_EXCL);
846 xfs_ireclaim(ip);
847 return error;
774 848
775 return xfs_reclaim_inode(ip, flags);
776} 849}
777 850
778int 851int
@@ -780,6 +853,6 @@ xfs_reclaim_inodes(
780 xfs_mount_t *mp, 853 xfs_mount_t *mp,
781 int mode) 854 int mode)
782{ 855{
783 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode, 856 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
784 XFS_ICI_RECLAIM_TAG); 857 XFS_ICI_RECLAIM_TAG, 1);
785} 858}
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index a500b4d91835..d480c346cabb 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -37,7 +37,6 @@ void xfs_syncd_stop(struct xfs_mount *mp);
37 37
38int xfs_sync_attr(struct xfs_mount *mp, int flags); 38int xfs_sync_attr(struct xfs_mount *mp, int flags);
39int xfs_sync_data(struct xfs_mount *mp, int flags); 39int xfs_sync_data(struct xfs_mount *mp, int flags);
40int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
41 40
42int xfs_quiesce_data(struct xfs_mount *mp); 41int xfs_quiesce_data(struct xfs_mount *mp);
43void xfs_quiesce_attr(struct xfs_mount *mp); 42void xfs_quiesce_attr(struct xfs_mount *mp);
@@ -54,6 +53,6 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
54int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); 53int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
55int xfs_inode_ag_iterator(struct xfs_mount *mp, 54int xfs_inode_ag_iterator(struct xfs_mount *mp,
56 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), 55 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
57 int flags, int tag); 56 int flags, int tag, int write_lock);
58 57
59#endif 58#endif
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index 856eb3c8d605..5a107601e969 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -52,22 +52,6 @@
52#include "quota/xfs_dquot.h" 52#include "quota/xfs_dquot.h"
53 53
54/* 54/*
55 * Format fsblock number into a static buffer & return it.
56 */
57STATIC char *xfs_fmtfsblock(xfs_fsblock_t bno)
58{
59 static char rval[50];
60
61 if (bno == NULLFSBLOCK)
62 sprintf(rval, "NULLFSBLOCK");
63 else if (isnullstartblock(bno))
64 sprintf(rval, "NULLSTARTBLOCK(%lld)", startblockval(bno));
65 else
66 sprintf(rval, "%lld", (xfs_dfsbno_t)bno);
67 return rval;
68}
69
70/*
71 * We include this last to have the helpers above available for the trace 55 * We include this last to have the helpers above available for the trace
72 * event implementations. 56 * event implementations.
73 */ 57 */
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index c40834bdee58..fcaa62f0799e 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -33,51 +33,82 @@ struct xfs_dquot;
33struct xlog_ticket; 33struct xlog_ticket;
34struct log; 34struct log;
35 35
36#define DEFINE_ATTR_LIST_EVENT(name) \ 36DECLARE_EVENT_CLASS(xfs_attr_list_class,
37 TP_PROTO(struct xfs_attr_list_context *ctx),
38 TP_ARGS(ctx),
39 TP_STRUCT__entry(
40 __field(dev_t, dev)
41 __field(xfs_ino_t, ino)
42 __field(u32, hashval)
43 __field(u32, blkno)
44 __field(u32, offset)
45 __field(void *, alist)
46 __field(int, bufsize)
47 __field(int, count)
48 __field(int, firstu)
49 __field(int, dupcnt)
50 __field(int, flags)
51 ),
52 TP_fast_assign(
53 __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
54 __entry->ino = ctx->dp->i_ino;
55 __entry->hashval = ctx->cursor->hashval;
56 __entry->blkno = ctx->cursor->blkno;
57 __entry->offset = ctx->cursor->offset;
58 __entry->alist = ctx->alist;
59 __entry->bufsize = ctx->bufsize;
60 __entry->count = ctx->count;
61 __entry->firstu = ctx->firstu;
62 __entry->flags = ctx->flags;
63 ),
64 TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
65 "alist 0x%p size %u count %u firstu %u flags %d %s",
66 MAJOR(__entry->dev), MINOR(__entry->dev),
67 __entry->ino,
68 __entry->hashval,
69 __entry->blkno,
70 __entry->offset,
71 __entry->dupcnt,
72 __entry->alist,
73 __entry->bufsize,
74 __entry->count,
75 __entry->firstu,
76 __entry->flags,
77 __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS)
78 )
79)
80
81#define DEFINE_PERAG_REF_EVENT(name) \
37TRACE_EVENT(name, \ 82TRACE_EVENT(name, \
38 TP_PROTO(struct xfs_attr_list_context *ctx), \ 83 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
39 TP_ARGS(ctx), \ 84 unsigned long caller_ip), \
85 TP_ARGS(mp, agno, refcount, caller_ip), \
40 TP_STRUCT__entry( \ 86 TP_STRUCT__entry( \
41 __field(dev_t, dev) \ 87 __field(dev_t, dev) \
42 __field(xfs_ino_t, ino) \ 88 __field(xfs_agnumber_t, agno) \
43 __field(u32, hashval) \ 89 __field(int, refcount) \
44 __field(u32, blkno) \ 90 __field(unsigned long, caller_ip) \
45 __field(u32, offset) \
46 __field(void *, alist) \
47 __field(int, bufsize) \
48 __field(int, count) \
49 __field(int, firstu) \
50 __field(int, dupcnt) \
51 __field(int, flags) \
52 ), \ 91 ), \
53 TP_fast_assign( \ 92 TP_fast_assign( \
54 __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; \ 93 __entry->dev = mp->m_super->s_dev; \
55 __entry->ino = ctx->dp->i_ino; \ 94 __entry->agno = agno; \
56 __entry->hashval = ctx->cursor->hashval; \ 95 __entry->refcount = refcount; \
57 __entry->blkno = ctx->cursor->blkno; \ 96 __entry->caller_ip = caller_ip; \
58 __entry->offset = ctx->cursor->offset; \
59 __entry->alist = ctx->alist; \
60 __entry->bufsize = ctx->bufsize; \
61 __entry->count = ctx->count; \
62 __entry->firstu = ctx->firstu; \
63 __entry->flags = ctx->flags; \
64 ), \ 97 ), \
65 TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " \ 98 TP_printk("dev %d:%d agno %u refcount %d caller %pf", \
66 "alist 0x%p size %u count %u firstu %u flags %d %s", \
67 MAJOR(__entry->dev), MINOR(__entry->dev), \ 99 MAJOR(__entry->dev), MINOR(__entry->dev), \
68 __entry->ino, \ 100 __entry->agno, \
69 __entry->hashval, \ 101 __entry->refcount, \
70 __entry->blkno, \ 102 (char *)__entry->caller_ip) \
71 __entry->offset, \ 103);
72 __entry->dupcnt, \ 104
73 __entry->alist, \ 105DEFINE_PERAG_REF_EVENT(xfs_perag_get)
74 __entry->bufsize, \ 106DEFINE_PERAG_REF_EVENT(xfs_perag_put)
75 __entry->count, \ 107
76 __entry->firstu, \ 108#define DEFINE_ATTR_LIST_EVENT(name) \
77 __entry->flags, \ 109DEFINE_EVENT(xfs_attr_list_class, name, \
78 __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS) \ 110 TP_PROTO(struct xfs_attr_list_context *ctx), \
79 ) \ 111 TP_ARGS(ctx))
80)
81DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf); 112DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf);
82DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf_all); 113DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf_all);
83DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf); 114DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf);
@@ -166,103 +197,111 @@ TRACE_EVENT(xfs_iext_insert,
166 __entry->caller_ip = caller_ip; 197 __entry->caller_ip = caller_ip;
167 ), 198 ),
168 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " 199 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
169 "offset %lld block %s count %lld flag %d caller %pf", 200 "offset %lld block %lld count %lld flag %d caller %pf",
170 MAJOR(__entry->dev), MINOR(__entry->dev), 201 MAJOR(__entry->dev), MINOR(__entry->dev),
171 __entry->ino, 202 __entry->ino,
172 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), 203 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
173 (long)__entry->idx, 204 (long)__entry->idx,
174 __entry->startoff, 205 __entry->startoff,
175 xfs_fmtfsblock(__entry->startblock), 206 (__int64_t)__entry->startblock,
176 __entry->blockcount, 207 __entry->blockcount,
177 __entry->state, 208 __entry->state,
178 (char *)__entry->caller_ip) 209 (char *)__entry->caller_ip)
179); 210);
180 211
212DECLARE_EVENT_CLASS(xfs_bmap_class,
213 TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state,
214 unsigned long caller_ip),
215 TP_ARGS(ip, idx, state, caller_ip),
216 TP_STRUCT__entry(
217 __field(dev_t, dev)
218 __field(xfs_ino_t, ino)
219 __field(xfs_extnum_t, idx)
220 __field(xfs_fileoff_t, startoff)
221 __field(xfs_fsblock_t, startblock)
222 __field(xfs_filblks_t, blockcount)
223 __field(xfs_exntst_t, state)
224 __field(int, bmap_state)
225 __field(unsigned long, caller_ip)
226 ),
227 TP_fast_assign(
228 struct xfs_ifork *ifp = (state & BMAP_ATTRFORK) ?
229 ip->i_afp : &ip->i_df;
230 struct xfs_bmbt_irec r;
231
232 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r);
233 __entry->dev = VFS_I(ip)->i_sb->s_dev;
234 __entry->ino = ip->i_ino;
235 __entry->idx = idx;
236 __entry->startoff = r.br_startoff;
237 __entry->startblock = r.br_startblock;
238 __entry->blockcount = r.br_blockcount;
239 __entry->state = r.br_state;
240 __entry->bmap_state = state;
241 __entry->caller_ip = caller_ip;
242 ),
243 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
244 "offset %lld block %lld count %lld flag %d caller %pf",
245 MAJOR(__entry->dev), MINOR(__entry->dev),
246 __entry->ino,
247 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
248 (long)__entry->idx,
249 __entry->startoff,
250 (__int64_t)__entry->startblock,
251 __entry->blockcount,
252 __entry->state,
253 (char *)__entry->caller_ip)
254)
255
181#define DEFINE_BMAP_EVENT(name) \ 256#define DEFINE_BMAP_EVENT(name) \
182TRACE_EVENT(name, \ 257DEFINE_EVENT(xfs_bmap_class, name, \
183 TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, \ 258 TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, \
184 unsigned long caller_ip), \ 259 unsigned long caller_ip), \
185 TP_ARGS(ip, idx, state, caller_ip), \ 260 TP_ARGS(ip, idx, state, caller_ip))
186 TP_STRUCT__entry( \
187 __field(dev_t, dev) \
188 __field(xfs_ino_t, ino) \
189 __field(xfs_extnum_t, idx) \
190 __field(xfs_fileoff_t, startoff) \
191 __field(xfs_fsblock_t, startblock) \
192 __field(xfs_filblks_t, blockcount) \
193 __field(xfs_exntst_t, state) \
194 __field(int, bmap_state) \
195 __field(unsigned long, caller_ip) \
196 ), \
197 TP_fast_assign( \
198 struct xfs_ifork *ifp = (state & BMAP_ATTRFORK) ? \
199 ip->i_afp : &ip->i_df; \
200 struct xfs_bmbt_irec r; \
201 \
202 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r); \
203 __entry->dev = VFS_I(ip)->i_sb->s_dev; \
204 __entry->ino = ip->i_ino; \
205 __entry->idx = idx; \
206 __entry->startoff = r.br_startoff; \
207 __entry->startblock = r.br_startblock; \
208 __entry->blockcount = r.br_blockcount; \
209 __entry->state = r.br_state; \
210 __entry->bmap_state = state; \
211 __entry->caller_ip = caller_ip; \
212 ), \
213 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " \
214 "offset %lld block %s count %lld flag %d caller %pf", \
215 MAJOR(__entry->dev), MINOR(__entry->dev), \
216 __entry->ino, \
217 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), \
218 (long)__entry->idx, \
219 __entry->startoff, \
220 xfs_fmtfsblock(__entry->startblock), \
221 __entry->blockcount, \
222 __entry->state, \
223 (char *)__entry->caller_ip) \
224)
225
226DEFINE_BMAP_EVENT(xfs_iext_remove); 261DEFINE_BMAP_EVENT(xfs_iext_remove);
227DEFINE_BMAP_EVENT(xfs_bmap_pre_update); 262DEFINE_BMAP_EVENT(xfs_bmap_pre_update);
228DEFINE_BMAP_EVENT(xfs_bmap_post_update); 263DEFINE_BMAP_EVENT(xfs_bmap_post_update);
229DEFINE_BMAP_EVENT(xfs_extlist); 264DEFINE_BMAP_EVENT(xfs_extlist);
230 265
231#define DEFINE_BUF_EVENT(tname) \ 266DECLARE_EVENT_CLASS(xfs_buf_class,
232TRACE_EVENT(tname, \ 267 TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip),
233 TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \ 268 TP_ARGS(bp, caller_ip),
234 TP_ARGS(bp, caller_ip), \ 269 TP_STRUCT__entry(
235 TP_STRUCT__entry( \ 270 __field(dev_t, dev)
236 __field(dev_t, dev) \ 271 __field(xfs_daddr_t, bno)
237 __field(xfs_daddr_t, bno) \ 272 __field(size_t, buffer_length)
238 __field(size_t, buffer_length) \ 273 __field(int, hold)
239 __field(int, hold) \ 274 __field(int, pincount)
240 __field(int, pincount) \ 275 __field(unsigned, lockval)
241 __field(unsigned, lockval) \ 276 __field(unsigned, flags)
242 __field(unsigned, flags) \ 277 __field(unsigned long, caller_ip)
243 __field(unsigned long, caller_ip) \ 278 ),
244 ), \ 279 TP_fast_assign(
245 TP_fast_assign( \ 280 __entry->dev = bp->b_target->bt_dev;
246 __entry->dev = bp->b_target->bt_dev; \ 281 __entry->bno = bp->b_bn;
247 __entry->bno = bp->b_bn; \ 282 __entry->buffer_length = bp->b_buffer_length;
248 __entry->buffer_length = bp->b_buffer_length; \ 283 __entry->hold = atomic_read(&bp->b_hold);
249 __entry->hold = atomic_read(&bp->b_hold); \ 284 __entry->pincount = atomic_read(&bp->b_pin_count);
250 __entry->pincount = atomic_read(&bp->b_pin_count); \ 285 __entry->lockval = xfs_buf_lock_value(bp);
251 __entry->lockval = xfs_buf_lock_value(bp); \ 286 __entry->flags = bp->b_flags;
252 __entry->flags = bp->b_flags; \ 287 __entry->caller_ip = caller_ip;
253 __entry->caller_ip = caller_ip; \ 288 ),
254 ), \ 289 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
255 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " \ 290 "lock %d flags %s caller %pf",
256 "lock %d flags %s caller %pf", \ 291 MAJOR(__entry->dev), MINOR(__entry->dev),
257 MAJOR(__entry->dev), MINOR(__entry->dev), \ 292 (unsigned long long)__entry->bno,
258 (unsigned long long)__entry->bno, \ 293 __entry->buffer_length,
259 __entry->buffer_length, \ 294 __entry->hold,
260 __entry->hold, \ 295 __entry->pincount,
261 __entry->pincount, \ 296 __entry->lockval,
262 __entry->lockval, \ 297 __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
263 __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), \ 298 (void *)__entry->caller_ip)
264 (void *)__entry->caller_ip) \
265) 299)
300
301#define DEFINE_BUF_EVENT(name) \
302DEFINE_EVENT(xfs_buf_class, name, \
303 TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \
304 TP_ARGS(bp, caller_ip))
266DEFINE_BUF_EVENT(xfs_buf_init); 305DEFINE_BUF_EVENT(xfs_buf_init);
267DEFINE_BUF_EVENT(xfs_buf_free); 306DEFINE_BUF_EVENT(xfs_buf_free);
268DEFINE_BUF_EVENT(xfs_buf_hold); 307DEFINE_BUF_EVENT(xfs_buf_hold);
@@ -299,41 +338,45 @@ DEFINE_BUF_EVENT(xfs_reset_dqcounts);
299DEFINE_BUF_EVENT(xfs_inode_item_push); 338DEFINE_BUF_EVENT(xfs_inode_item_push);
300 339
301/* pass flags explicitly */ 340/* pass flags explicitly */
302#define DEFINE_BUF_FLAGS_EVENT(tname) \ 341DECLARE_EVENT_CLASS(xfs_buf_flags_class,
303TRACE_EVENT(tname, \ 342 TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip),
304 TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \ 343 TP_ARGS(bp, flags, caller_ip),
305 TP_ARGS(bp, flags, caller_ip), \ 344 TP_STRUCT__entry(
306 TP_STRUCT__entry( \ 345 __field(dev_t, dev)
307 __field(dev_t, dev) \ 346 __field(xfs_daddr_t, bno)
308 __field(xfs_daddr_t, bno) \ 347 __field(size_t, buffer_length)
309 __field(size_t, buffer_length) \ 348 __field(int, hold)
310 __field(int, hold) \ 349 __field(int, pincount)
311 __field(int, pincount) \ 350 __field(unsigned, lockval)
312 __field(unsigned, lockval) \ 351 __field(unsigned, flags)
313 __field(unsigned, flags) \ 352 __field(unsigned long, caller_ip)
314 __field(unsigned long, caller_ip) \ 353 ),
315 ), \ 354 TP_fast_assign(
316 TP_fast_assign( \ 355 __entry->dev = bp->b_target->bt_dev;
317 __entry->dev = bp->b_target->bt_dev; \ 356 __entry->bno = bp->b_bn;
318 __entry->bno = bp->b_bn; \ 357 __entry->buffer_length = bp->b_buffer_length;
319 __entry->buffer_length = bp->b_buffer_length; \ 358 __entry->flags = flags;
320 __entry->flags = flags; \ 359 __entry->hold = atomic_read(&bp->b_hold);
321 __entry->hold = atomic_read(&bp->b_hold); \ 360 __entry->pincount = atomic_read(&bp->b_pin_count);
322 __entry->pincount = atomic_read(&bp->b_pin_count); \ 361 __entry->lockval = xfs_buf_lock_value(bp);
323 __entry->lockval = xfs_buf_lock_value(bp); \ 362 __entry->caller_ip = caller_ip;
324 __entry->caller_ip = caller_ip; \ 363 ),
325 ), \ 364 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
326 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " \ 365 "lock %d flags %s caller %pf",
327 "lock %d flags %s caller %pf", \ 366 MAJOR(__entry->dev), MINOR(__entry->dev),
328 MAJOR(__entry->dev), MINOR(__entry->dev), \ 367 (unsigned long long)__entry->bno,
329 (unsigned long long)__entry->bno, \ 368 __entry->buffer_length,
330 __entry->buffer_length, \ 369 __entry->hold,
331 __entry->hold, \ 370 __entry->pincount,
332 __entry->pincount, \ 371 __entry->lockval,
333 __entry->lockval, \ 372 __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
334 __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), \ 373 (void *)__entry->caller_ip)
335 (void *)__entry->caller_ip) \
336) 374)
375
376#define DEFINE_BUF_FLAGS_EVENT(name) \
377DEFINE_EVENT(xfs_buf_flags_class, name, \
378 TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \
379 TP_ARGS(bp, flags, caller_ip))
337DEFINE_BUF_FLAGS_EVENT(xfs_buf_find); 380DEFINE_BUF_FLAGS_EVENT(xfs_buf_find);
338DEFINE_BUF_FLAGS_EVENT(xfs_buf_get); 381DEFINE_BUF_FLAGS_EVENT(xfs_buf_get);
339DEFINE_BUF_FLAGS_EVENT(xfs_buf_read); 382DEFINE_BUF_FLAGS_EVENT(xfs_buf_read);
@@ -376,55 +419,58 @@ TRACE_EVENT(xfs_buf_ioerror,
376 (void *)__entry->caller_ip) 419 (void *)__entry->caller_ip)
377); 420);
378 421
379#define DEFINE_BUF_ITEM_EVENT(tname) \ 422DECLARE_EVENT_CLASS(xfs_buf_item_class,
380TRACE_EVENT(tname, \ 423 TP_PROTO(struct xfs_buf_log_item *bip),
381 TP_PROTO(struct xfs_buf_log_item *bip), \ 424 TP_ARGS(bip),
382 TP_ARGS(bip), \ 425 TP_STRUCT__entry(
383 TP_STRUCT__entry( \ 426 __field(dev_t, dev)
384 __field(dev_t, dev) \ 427 __field(xfs_daddr_t, buf_bno)
385 __field(xfs_daddr_t, buf_bno) \ 428 __field(size_t, buf_len)
386 __field(size_t, buf_len) \ 429 __field(int, buf_hold)
387 __field(int, buf_hold) \ 430 __field(int, buf_pincount)
388 __field(int, buf_pincount) \ 431 __field(int, buf_lockval)
389 __field(int, buf_lockval) \ 432 __field(unsigned, buf_flags)
390 __field(unsigned, buf_flags) \ 433 __field(unsigned, bli_recur)
391 __field(unsigned, bli_recur) \ 434 __field(int, bli_refcount)
392 __field(int, bli_refcount) \ 435 __field(unsigned, bli_flags)
393 __field(unsigned, bli_flags) \ 436 __field(void *, li_desc)
394 __field(void *, li_desc) \ 437 __field(unsigned, li_flags)
395 __field(unsigned, li_flags) \ 438 ),
396 ), \ 439 TP_fast_assign(
397 TP_fast_assign( \ 440 __entry->dev = bip->bli_buf->b_target->bt_dev;
398 __entry->dev = bip->bli_buf->b_target->bt_dev; \ 441 __entry->bli_flags = bip->bli_flags;
399 __entry->bli_flags = bip->bli_flags; \ 442 __entry->bli_recur = bip->bli_recur;
400 __entry->bli_recur = bip->bli_recur; \ 443 __entry->bli_refcount = atomic_read(&bip->bli_refcount);
401 __entry->bli_refcount = atomic_read(&bip->bli_refcount); \ 444 __entry->buf_bno = bip->bli_buf->b_bn;
402 __entry->buf_bno = bip->bli_buf->b_bn; \ 445 __entry->buf_len = bip->bli_buf->b_buffer_length;
403 __entry->buf_len = bip->bli_buf->b_buffer_length; \ 446 __entry->buf_flags = bip->bli_buf->b_flags;
404 __entry->buf_flags = bip->bli_buf->b_flags; \ 447 __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
405 __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); \ 448 __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
406 __entry->buf_pincount = \ 449 __entry->buf_lockval = xfs_buf_lock_value(bip->bli_buf);
407 atomic_read(&bip->bli_buf->b_pin_count); \ 450 __entry->li_desc = bip->bli_item.li_desc;
408 __entry->buf_lockval = xfs_buf_lock_value(bip->bli_buf); \ 451 __entry->li_flags = bip->bli_item.li_flags;
409 __entry->li_desc = bip->bli_item.li_desc; \ 452 ),
410 __entry->li_flags = bip->bli_item.li_flags; \ 453 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
411 ), \ 454 "lock %d flags %s recur %d refcount %d bliflags %s "
412 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " \ 455 "lidesc 0x%p liflags %s",
413 "lock %d flags %s recur %d refcount %d bliflags %s " \ 456 MAJOR(__entry->dev), MINOR(__entry->dev),
414 "lidesc 0x%p liflags %s", \ 457 (unsigned long long)__entry->buf_bno,
415 MAJOR(__entry->dev), MINOR(__entry->dev), \ 458 __entry->buf_len,
416 (unsigned long long)__entry->buf_bno, \ 459 __entry->buf_hold,
417 __entry->buf_len, \ 460 __entry->buf_pincount,
418 __entry->buf_hold, \ 461 __entry->buf_lockval,
419 __entry->buf_pincount, \ 462 __print_flags(__entry->buf_flags, "|", XFS_BUF_FLAGS),
420 __entry->buf_lockval, \ 463 __entry->bli_recur,
421 __print_flags(__entry->buf_flags, "|", XFS_BUF_FLAGS), \ 464 __entry->bli_refcount,
422 __entry->bli_recur, \ 465 __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS),
423 __entry->bli_refcount, \ 466 __entry->li_desc,
424 __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS), \ 467 __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS))
425 __entry->li_desc, \
426 __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS)) \
427) 468)
469
470#define DEFINE_BUF_ITEM_EVENT(name) \
471DEFINE_EVENT(xfs_buf_item_class, name, \
472 TP_PROTO(struct xfs_buf_log_item *bip), \
473 TP_ARGS(bip))
428DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); 474DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
429DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); 475DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
430DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); 476DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
@@ -437,6 +483,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
437DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); 483DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale);
438DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); 484DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
439DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); 485DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
486DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf);
440DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); 487DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
441DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur); 488DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur);
442DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb); 489DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb);
@@ -450,78 +497,90 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
450DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); 497DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
451DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); 498DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
452 499
500DECLARE_EVENT_CLASS(xfs_lock_class,
501 TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
502 unsigned long caller_ip),
503 TP_ARGS(ip, lock_flags, caller_ip),
504 TP_STRUCT__entry(
505 __field(dev_t, dev)
506 __field(xfs_ino_t, ino)
507 __field(int, lock_flags)
508 __field(unsigned long, caller_ip)
509 ),
510 TP_fast_assign(
511 __entry->dev = VFS_I(ip)->i_sb->s_dev;
512 __entry->ino = ip->i_ino;
513 __entry->lock_flags = lock_flags;
514 __entry->caller_ip = caller_ip;
515 ),
516 TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf",
517 MAJOR(__entry->dev), MINOR(__entry->dev),
518 __entry->ino,
519 __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS),
520 (void *)__entry->caller_ip)
521)
522
453#define DEFINE_LOCK_EVENT(name) \ 523#define DEFINE_LOCK_EVENT(name) \
454TRACE_EVENT(name, \ 524DEFINE_EVENT(xfs_lock_class, name, \
455 TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, \ 525 TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, \
456 unsigned long caller_ip), \ 526 unsigned long caller_ip), \
457 TP_ARGS(ip, lock_flags, caller_ip), \ 527 TP_ARGS(ip, lock_flags, caller_ip))
458 TP_STRUCT__entry( \
459 __field(dev_t, dev) \
460 __field(xfs_ino_t, ino) \
461 __field(int, lock_flags) \
462 __field(unsigned long, caller_ip) \
463 ), \
464 TP_fast_assign( \
465 __entry->dev = VFS_I(ip)->i_sb->s_dev; \
466 __entry->ino = ip->i_ino; \
467 __entry->lock_flags = lock_flags; \
468 __entry->caller_ip = caller_ip; \
469 ), \
470 TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf", \
471 MAJOR(__entry->dev), MINOR(__entry->dev), \
472 __entry->ino, \
473 __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS), \
474 (void *)__entry->caller_ip) \
475)
476
477DEFINE_LOCK_EVENT(xfs_ilock); 528DEFINE_LOCK_EVENT(xfs_ilock);
478DEFINE_LOCK_EVENT(xfs_ilock_nowait); 529DEFINE_LOCK_EVENT(xfs_ilock_nowait);
479DEFINE_LOCK_EVENT(xfs_ilock_demote); 530DEFINE_LOCK_EVENT(xfs_ilock_demote);
480DEFINE_LOCK_EVENT(xfs_iunlock); 531DEFINE_LOCK_EVENT(xfs_iunlock);
481 532
533DECLARE_EVENT_CLASS(xfs_iget_class,
534 TP_PROTO(struct xfs_inode *ip),
535 TP_ARGS(ip),
536 TP_STRUCT__entry(
537 __field(dev_t, dev)
538 __field(xfs_ino_t, ino)
539 ),
540 TP_fast_assign(
541 __entry->dev = VFS_I(ip)->i_sb->s_dev;
542 __entry->ino = ip->i_ino;
543 ),
544 TP_printk("dev %d:%d ino 0x%llx",
545 MAJOR(__entry->dev), MINOR(__entry->dev),
546 __entry->ino)
547)
548
482#define DEFINE_IGET_EVENT(name) \ 549#define DEFINE_IGET_EVENT(name) \
483TRACE_EVENT(name, \ 550DEFINE_EVENT(xfs_iget_class, name, \
484 TP_PROTO(struct xfs_inode *ip), \ 551 TP_PROTO(struct xfs_inode *ip), \
485 TP_ARGS(ip), \ 552 TP_ARGS(ip))
486 TP_STRUCT__entry( \
487 __field(dev_t, dev) \
488 __field(xfs_ino_t, ino) \
489 ), \
490 TP_fast_assign( \
491 __entry->dev = VFS_I(ip)->i_sb->s_dev; \
492 __entry->ino = ip->i_ino; \
493 ), \
494 TP_printk("dev %d:%d ino 0x%llx", \
495 MAJOR(__entry->dev), MINOR(__entry->dev), \
496 __entry->ino) \
497)
498DEFINE_IGET_EVENT(xfs_iget_skip); 553DEFINE_IGET_EVENT(xfs_iget_skip);
499DEFINE_IGET_EVENT(xfs_iget_reclaim); 554DEFINE_IGET_EVENT(xfs_iget_reclaim);
500DEFINE_IGET_EVENT(xfs_iget_found); 555DEFINE_IGET_EVENT(xfs_iget_found);
501DEFINE_IGET_EVENT(xfs_iget_alloc); 556DEFINE_IGET_EVENT(xfs_iget_alloc);
502 557
558DECLARE_EVENT_CLASS(xfs_inode_class,
559 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
560 TP_ARGS(ip, caller_ip),
561 TP_STRUCT__entry(
562 __field(dev_t, dev)
563 __field(xfs_ino_t, ino)
564 __field(int, count)
565 __field(unsigned long, caller_ip)
566 ),
567 TP_fast_assign(
568 __entry->dev = VFS_I(ip)->i_sb->s_dev;
569 __entry->ino = ip->i_ino;
570 __entry->count = atomic_read(&VFS_I(ip)->i_count);
571 __entry->caller_ip = caller_ip;
572 ),
573 TP_printk("dev %d:%d ino 0x%llx count %d caller %pf",
574 MAJOR(__entry->dev), MINOR(__entry->dev),
575 __entry->ino,
576 __entry->count,
577 (char *)__entry->caller_ip)
578)
579
503#define DEFINE_INODE_EVENT(name) \ 580#define DEFINE_INODE_EVENT(name) \
504TRACE_EVENT(name, \ 581DEFINE_EVENT(xfs_inode_class, name, \
505 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ 582 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
506 TP_ARGS(ip, caller_ip), \ 583 TP_ARGS(ip, caller_ip))
507 TP_STRUCT__entry( \
508 __field(dev_t, dev) \
509 __field(xfs_ino_t, ino) \
510 __field(int, count) \
511 __field(unsigned long, caller_ip) \
512 ), \
513 TP_fast_assign( \
514 __entry->dev = VFS_I(ip)->i_sb->s_dev; \
515 __entry->ino = ip->i_ino; \
516 __entry->count = atomic_read(&VFS_I(ip)->i_count); \
517 __entry->caller_ip = caller_ip; \
518 ), \
519 TP_printk("dev %d:%d ino 0x%llx count %d caller %pf", \
520 MAJOR(__entry->dev), MINOR(__entry->dev), \
521 __entry->ino, \
522 __entry->count, \
523 (char *)__entry->caller_ip) \
524)
525DEFINE_INODE_EVENT(xfs_ihold); 584DEFINE_INODE_EVENT(xfs_ihold);
526DEFINE_INODE_EVENT(xfs_irele); 585DEFINE_INODE_EVENT(xfs_irele);
527/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */ 586/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */
@@ -529,55 +588,59 @@ DEFINE_INODE_EVENT(xfs_inode);
529#define xfs_itrace_entry(ip) \ 588#define xfs_itrace_entry(ip) \
530 trace_xfs_inode(ip, _THIS_IP_) 589 trace_xfs_inode(ip, _THIS_IP_)
531 590
532#define DEFINE_DQUOT_EVENT(tname) \ 591DECLARE_EVENT_CLASS(xfs_dquot_class,
533TRACE_EVENT(tname, \ 592 TP_PROTO(struct xfs_dquot *dqp),
534 TP_PROTO(struct xfs_dquot *dqp), \ 593 TP_ARGS(dqp),
535 TP_ARGS(dqp), \ 594 TP_STRUCT__entry(
536 TP_STRUCT__entry( \ 595 __field(dev_t, dev)
537 __field(dev_t, dev) \ 596 __field(u32, id)
538 __field(__be32, id) \ 597 __field(unsigned, flags)
539 __field(unsigned, flags) \ 598 __field(unsigned, nrefs)
540 __field(unsigned, nrefs) \ 599 __field(unsigned long long, res_bcount)
541 __field(unsigned long long, res_bcount) \ 600 __field(unsigned long long, bcount)
542 __field(unsigned long long, bcount) \ 601 __field(unsigned long long, icount)
543 __field(unsigned long long, icount) \ 602 __field(unsigned long long, blk_hardlimit)
544 __field(unsigned long long, blk_hardlimit) \ 603 __field(unsigned long long, blk_softlimit)
545 __field(unsigned long long, blk_softlimit) \ 604 __field(unsigned long long, ino_hardlimit)
546 __field(unsigned long long, ino_hardlimit) \ 605 __field(unsigned long long, ino_softlimit)
547 __field(unsigned long long, ino_softlimit) \
548 ), \
549 TP_fast_assign( \
550 __entry->dev = dqp->q_mount->m_super->s_dev; \
551 __entry->id = dqp->q_core.d_id; \
552 __entry->flags = dqp->dq_flags; \
553 __entry->nrefs = dqp->q_nrefs; \
554 __entry->res_bcount = dqp->q_res_bcount; \
555 __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount); \
556 __entry->icount = be64_to_cpu(dqp->q_core.d_icount); \
557 __entry->blk_hardlimit = \
558 be64_to_cpu(dqp->q_core.d_blk_hardlimit); \
559 __entry->blk_softlimit = \
560 be64_to_cpu(dqp->q_core.d_blk_softlimit); \
561 __entry->ino_hardlimit = \
562 be64_to_cpu(dqp->q_core.d_ino_hardlimit); \
563 __entry->ino_softlimit = \
564 be64_to_cpu(dqp->q_core.d_ino_softlimit); \
565 ), \ 606 ), \
566 TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " \ 607 TP_fast_assign(
567 "bcnt 0x%llx [hard 0x%llx | soft 0x%llx] " \ 608 __entry->dev = dqp->q_mount->m_super->s_dev;
568 "icnt 0x%llx [hard 0x%llx | soft 0x%llx]", \ 609 __entry->id = be32_to_cpu(dqp->q_core.d_id);
569 MAJOR(__entry->dev), MINOR(__entry->dev), \ 610 __entry->flags = dqp->dq_flags;
570 be32_to_cpu(__entry->id), \ 611 __entry->nrefs = dqp->q_nrefs;
571 __print_flags(__entry->flags, "|", XFS_DQ_FLAGS), \ 612 __entry->res_bcount = dqp->q_res_bcount;
572 __entry->nrefs, \ 613 __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount);
573 __entry->res_bcount, \ 614 __entry->icount = be64_to_cpu(dqp->q_core.d_icount);
574 __entry->bcount, \ 615 __entry->blk_hardlimit =
575 __entry->blk_hardlimit, \ 616 be64_to_cpu(dqp->q_core.d_blk_hardlimit);
576 __entry->blk_softlimit, \ 617 __entry->blk_softlimit =
577 __entry->icount, \ 618 be64_to_cpu(dqp->q_core.d_blk_softlimit);
578 __entry->ino_hardlimit, \ 619 __entry->ino_hardlimit =
579 __entry->ino_softlimit) \ 620 be64_to_cpu(dqp->q_core.d_ino_hardlimit);
621 __entry->ino_softlimit =
622 be64_to_cpu(dqp->q_core.d_ino_softlimit);
623 ),
624 TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx "
625 "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx "
626 "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]",
627 MAJOR(__entry->dev), MINOR(__entry->dev),
628 __entry->id,
629 __print_flags(__entry->flags, "|", XFS_DQ_FLAGS),
630 __entry->nrefs,
631 __entry->res_bcount,
632 __entry->bcount,
633 __entry->blk_hardlimit,
634 __entry->blk_softlimit,
635 __entry->icount,
636 __entry->ino_hardlimit,
637 __entry->ino_softlimit)
580) 638)
639
640#define DEFINE_DQUOT_EVENT(name) \
641DEFINE_EVENT(xfs_dquot_class, name, \
642 TP_PROTO(struct xfs_dquot *dqp), \
643 TP_ARGS(dqp))
581DEFINE_DQUOT_EVENT(xfs_dqadjust); 644DEFINE_DQUOT_EVENT(xfs_dqadjust);
582DEFINE_DQUOT_EVENT(xfs_dqshake_dirty); 645DEFINE_DQUOT_EVENT(xfs_dqshake_dirty);
583DEFINE_DQUOT_EVENT(xfs_dqshake_unlink); 646DEFINE_DQUOT_EVENT(xfs_dqshake_unlink);
@@ -610,72 +673,75 @@ DEFINE_DQUOT_EVENT(xfs_dqflush_done);
610DEFINE_IGET_EVENT(xfs_dquot_dqalloc); 673DEFINE_IGET_EVENT(xfs_dquot_dqalloc);
611DEFINE_IGET_EVENT(xfs_dquot_dqdetach); 674DEFINE_IGET_EVENT(xfs_dquot_dqdetach);
612 675
676DECLARE_EVENT_CLASS(xfs_loggrant_class,
677 TP_PROTO(struct log *log, struct xlog_ticket *tic),
678 TP_ARGS(log, tic),
679 TP_STRUCT__entry(
680 __field(dev_t, dev)
681 __field(unsigned, trans_type)
682 __field(char, ocnt)
683 __field(char, cnt)
684 __field(int, curr_res)
685 __field(int, unit_res)
686 __field(unsigned int, flags)
687 __field(void *, reserve_headq)
688 __field(void *, write_headq)
689 __field(int, grant_reserve_cycle)
690 __field(int, grant_reserve_bytes)
691 __field(int, grant_write_cycle)
692 __field(int, grant_write_bytes)
693 __field(int, curr_cycle)
694 __field(int, curr_block)
695 __field(xfs_lsn_t, tail_lsn)
696 ),
697 TP_fast_assign(
698 __entry->dev = log->l_mp->m_super->s_dev;
699 __entry->trans_type = tic->t_trans_type;
700 __entry->ocnt = tic->t_ocnt;
701 __entry->cnt = tic->t_cnt;
702 __entry->curr_res = tic->t_curr_res;
703 __entry->unit_res = tic->t_unit_res;
704 __entry->flags = tic->t_flags;
705 __entry->reserve_headq = log->l_reserve_headq;
706 __entry->write_headq = log->l_write_headq;
707 __entry->grant_reserve_cycle = log->l_grant_reserve_cycle;
708 __entry->grant_reserve_bytes = log->l_grant_reserve_bytes;
709 __entry->grant_write_cycle = log->l_grant_write_cycle;
710 __entry->grant_write_bytes = log->l_grant_write_bytes;
711 __entry->curr_cycle = log->l_curr_cycle;
712 __entry->curr_block = log->l_curr_block;
713 __entry->tail_lsn = log->l_tail_lsn;
714 ),
715 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
716 "t_unit_res %u t_flags %s reserve_headq 0x%p "
717 "write_headq 0x%p grant_reserve_cycle %d "
718 "grant_reserve_bytes %d grant_write_cycle %d "
719 "grant_write_bytes %d curr_cycle %d curr_block %d "
720 "tail_cycle %d tail_block %d",
721 MAJOR(__entry->dev), MINOR(__entry->dev),
722 __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES),
723 __entry->ocnt,
724 __entry->cnt,
725 __entry->curr_res,
726 __entry->unit_res,
727 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
728 __entry->reserve_headq,
729 __entry->write_headq,
730 __entry->grant_reserve_cycle,
731 __entry->grant_reserve_bytes,
732 __entry->grant_write_cycle,
733 __entry->grant_write_bytes,
734 __entry->curr_cycle,
735 __entry->curr_block,
736 CYCLE_LSN(__entry->tail_lsn),
737 BLOCK_LSN(__entry->tail_lsn)
738 )
739)
613 740
614#define DEFINE_LOGGRANT_EVENT(tname) \ 741#define DEFINE_LOGGRANT_EVENT(name) \
615TRACE_EVENT(tname, \ 742DEFINE_EVENT(xfs_loggrant_class, name, \
616 TP_PROTO(struct log *log, struct xlog_ticket *tic), \ 743 TP_PROTO(struct log *log, struct xlog_ticket *tic), \
617 TP_ARGS(log, tic), \ 744 TP_ARGS(log, tic))
618 TP_STRUCT__entry( \
619 __field(dev_t, dev) \
620 __field(unsigned, trans_type) \
621 __field(char, ocnt) \
622 __field(char, cnt) \
623 __field(int, curr_res) \
624 __field(int, unit_res) \
625 __field(unsigned int, flags) \
626 __field(void *, reserve_headq) \
627 __field(void *, write_headq) \
628 __field(int, grant_reserve_cycle) \
629 __field(int, grant_reserve_bytes) \
630 __field(int, grant_write_cycle) \
631 __field(int, grant_write_bytes) \
632 __field(int, curr_cycle) \
633 __field(int, curr_block) \
634 __field(xfs_lsn_t, tail_lsn) \
635 ), \
636 TP_fast_assign( \
637 __entry->dev = log->l_mp->m_super->s_dev; \
638 __entry->trans_type = tic->t_trans_type; \
639 __entry->ocnt = tic->t_ocnt; \
640 __entry->cnt = tic->t_cnt; \
641 __entry->curr_res = tic->t_curr_res; \
642 __entry->unit_res = tic->t_unit_res; \
643 __entry->flags = tic->t_flags; \
644 __entry->reserve_headq = log->l_reserve_headq; \
645 __entry->write_headq = log->l_write_headq; \
646 __entry->grant_reserve_cycle = log->l_grant_reserve_cycle; \
647 __entry->grant_reserve_bytes = log->l_grant_reserve_bytes; \
648 __entry->grant_write_cycle = log->l_grant_write_cycle; \
649 __entry->grant_write_bytes = log->l_grant_write_bytes; \
650 __entry->curr_cycle = log->l_curr_cycle; \
651 __entry->curr_block = log->l_curr_block; \
652 __entry->tail_lsn = log->l_tail_lsn; \
653 ), \
654 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " \
655 "t_unit_res %u t_flags %s reserve_headq 0x%p " \
656 "write_headq 0x%p grant_reserve_cycle %d " \
657 "grant_reserve_bytes %d grant_write_cycle %d " \
658 "grant_write_bytes %d curr_cycle %d curr_block %d " \
659 "tail_cycle %d tail_block %d", \
660 MAJOR(__entry->dev), MINOR(__entry->dev), \
661 __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES), \
662 __entry->ocnt, \
663 __entry->cnt, \
664 __entry->curr_res, \
665 __entry->unit_res, \
666 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), \
667 __entry->reserve_headq, \
668 __entry->write_headq, \
669 __entry->grant_reserve_cycle, \
670 __entry->grant_reserve_bytes, \
671 __entry->grant_write_cycle, \
672 __entry->grant_write_bytes, \
673 __entry->curr_cycle, \
674 __entry->curr_block, \
675 CYCLE_LSN(__entry->tail_lsn), \
676 BLOCK_LSN(__entry->tail_lsn) \
677 ) \
678)
679DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm); 745DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm);
680DEFINE_LOGGRANT_EVENT(xfs_log_done_perm); 746DEFINE_LOGGRANT_EVENT(xfs_log_done_perm);
681DEFINE_LOGGRANT_EVENT(xfs_log_reserve); 747DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
@@ -815,7 +881,7 @@ TRACE_EVENT(name, \
815 ), \ 881 ), \
816 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ 882 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
817 "offset 0x%llx count %zd flags %s " \ 883 "offset 0x%llx count %zd flags %s " \
818 "startoff 0x%llx startblock 0x%llx blockcount 0x%llx", \ 884 "startoff 0x%llx startblock %lld blockcount 0x%llx", \
819 MAJOR(__entry->dev), MINOR(__entry->dev), \ 885 MAJOR(__entry->dev), MINOR(__entry->dev), \
820 __entry->ino, \ 886 __entry->ino, \
821 __entry->size, \ 887 __entry->size, \
@@ -824,7 +890,7 @@ TRACE_EVENT(name, \
824 __entry->count, \ 890 __entry->count, \
825 __print_flags(__entry->flags, "|", BMAPI_FLAGS), \ 891 __print_flags(__entry->flags, "|", BMAPI_FLAGS), \
826 __entry->startoff, \ 892 __entry->startoff, \
827 __entry->startblock, \ 893 (__int64_t)__entry->startblock, \
828 __entry->blockcount) \ 894 __entry->blockcount) \
829) 895)
830DEFINE_IOMAP_EVENT(xfs_iomap_enter); 896DEFINE_IOMAP_EVENT(xfs_iomap_enter);
@@ -897,28 +963,32 @@ TRACE_EVENT(xfs_itruncate_start,
897 __entry->toss_finish) 963 __entry->toss_finish)
898); 964);
899 965
966DECLARE_EVENT_CLASS(xfs_itrunc_class,
967 TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
968 TP_ARGS(ip, new_size),
969 TP_STRUCT__entry(
970 __field(dev_t, dev)
971 __field(xfs_ino_t, ino)
972 __field(xfs_fsize_t, size)
973 __field(xfs_fsize_t, new_size)
974 ),
975 TP_fast_assign(
976 __entry->dev = VFS_I(ip)->i_sb->s_dev;
977 __entry->ino = ip->i_ino;
978 __entry->size = ip->i_d.di_size;
979 __entry->new_size = new_size;
980 ),
981 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx",
982 MAJOR(__entry->dev), MINOR(__entry->dev),
983 __entry->ino,
984 __entry->size,
985 __entry->new_size)
986)
987
900#define DEFINE_ITRUNC_EVENT(name) \ 988#define DEFINE_ITRUNC_EVENT(name) \
901TRACE_EVENT(name, \ 989DEFINE_EVENT(xfs_itrunc_class, name, \
902 TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \ 990 TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \
903 TP_ARGS(ip, new_size), \ 991 TP_ARGS(ip, new_size))
904 TP_STRUCT__entry( \
905 __field(dev_t, dev) \
906 __field(xfs_ino_t, ino) \
907 __field(xfs_fsize_t, size) \
908 __field(xfs_fsize_t, new_size) \
909 ), \
910 TP_fast_assign( \
911 __entry->dev = VFS_I(ip)->i_sb->s_dev; \
912 __entry->ino = ip->i_ino; \
913 __entry->size = ip->i_d.di_size; \
914 __entry->new_size = new_size; \
915 ), \
916 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx", \
917 MAJOR(__entry->dev), MINOR(__entry->dev), \
918 __entry->ino, \
919 __entry->size, \
920 __entry->new_size) \
921)
922DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_start); 992DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_start);
923DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_end); 993DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_end);
924 994
@@ -1037,28 +1107,28 @@ TRACE_EVENT(xfs_alloc_unbusy,
1037 1107
1038TRACE_EVENT(xfs_alloc_busysearch, 1108TRACE_EVENT(xfs_alloc_busysearch,
1039 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, 1109 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
1040 xfs_extlen_t len, int found), 1110 xfs_extlen_t len, xfs_lsn_t lsn),
1041 TP_ARGS(mp, agno, agbno, len, found), 1111 TP_ARGS(mp, agno, agbno, len, lsn),
1042 TP_STRUCT__entry( 1112 TP_STRUCT__entry(
1043 __field(dev_t, dev) 1113 __field(dev_t, dev)
1044 __field(xfs_agnumber_t, agno) 1114 __field(xfs_agnumber_t, agno)
1045 __field(xfs_agblock_t, agbno) 1115 __field(xfs_agblock_t, agbno)
1046 __field(xfs_extlen_t, len) 1116 __field(xfs_extlen_t, len)
1047 __field(int, found) 1117 __field(xfs_lsn_t, lsn)
1048 ), 1118 ),
1049 TP_fast_assign( 1119 TP_fast_assign(
1050 __entry->dev = mp->m_super->s_dev; 1120 __entry->dev = mp->m_super->s_dev;
1051 __entry->agno = agno; 1121 __entry->agno = agno;
1052 __entry->agbno = agbno; 1122 __entry->agbno = agbno;
1053 __entry->len = len; 1123 __entry->len = len;
1054 __entry->found = found; 1124 __entry->lsn = lsn;
1055 ), 1125 ),
1056 TP_printk("dev %d:%d agno %u agbno %u len %u %s", 1126 TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx",
1057 MAJOR(__entry->dev), MINOR(__entry->dev), 1127 MAJOR(__entry->dev), MINOR(__entry->dev),
1058 __entry->agno, 1128 __entry->agno,
1059 __entry->agbno, 1129 __entry->agbno,
1060 __entry->len, 1130 __entry->len,
1061 __print_symbolic(__entry->found, XFS_BUSY_STATES)) 1131 __entry->lsn)
1062); 1132);
1063 1133
1064TRACE_EVENT(xfs_agf, 1134TRACE_EVENT(xfs_agf,
@@ -1152,77 +1222,80 @@ TRACE_EVENT(xfs_free_extent,
1152 1222
1153); 1223);
1154 1224
1155#define DEFINE_ALLOC_EVENT(name) \ 1225DECLARE_EVENT_CLASS(xfs_alloc_class,
1156TRACE_EVENT(name, \ 1226 TP_PROTO(struct xfs_alloc_arg *args),
1157 TP_PROTO(struct xfs_alloc_arg *args), \ 1227 TP_ARGS(args),
1158 TP_ARGS(args), \ 1228 TP_STRUCT__entry(
1159 TP_STRUCT__entry( \ 1229 __field(dev_t, dev)
1160 __field(dev_t, dev) \ 1230 __field(xfs_agnumber_t, agno)
1161 __field(xfs_agnumber_t, agno) \ 1231 __field(xfs_agblock_t, agbno)
1162 __field(xfs_agblock_t, agbno) \ 1232 __field(xfs_extlen_t, minlen)
1163 __field(xfs_extlen_t, minlen) \ 1233 __field(xfs_extlen_t, maxlen)
1164 __field(xfs_extlen_t, maxlen) \ 1234 __field(xfs_extlen_t, mod)
1165 __field(xfs_extlen_t, mod) \ 1235 __field(xfs_extlen_t, prod)
1166 __field(xfs_extlen_t, prod) \ 1236 __field(xfs_extlen_t, minleft)
1167 __field(xfs_extlen_t, minleft) \ 1237 __field(xfs_extlen_t, total)
1168 __field(xfs_extlen_t, total) \ 1238 __field(xfs_extlen_t, alignment)
1169 __field(xfs_extlen_t, alignment) \ 1239 __field(xfs_extlen_t, minalignslop)
1170 __field(xfs_extlen_t, minalignslop) \ 1240 __field(xfs_extlen_t, len)
1171 __field(xfs_extlen_t, len) \ 1241 __field(short, type)
1172 __field(short, type) \ 1242 __field(short, otype)
1173 __field(short, otype) \ 1243 __field(char, wasdel)
1174 __field(char, wasdel) \ 1244 __field(char, wasfromfl)
1175 __field(char, wasfromfl) \ 1245 __field(char, isfl)
1176 __field(char, isfl) \ 1246 __field(char, userdata)
1177 __field(char, userdata) \ 1247 __field(xfs_fsblock_t, firstblock)
1178 __field(xfs_fsblock_t, firstblock) \ 1248 ),
1179 ), \ 1249 TP_fast_assign(
1180 TP_fast_assign( \ 1250 __entry->dev = args->mp->m_super->s_dev;
1181 __entry->dev = args->mp->m_super->s_dev; \ 1251 __entry->agno = args->agno;
1182 __entry->agno = args->agno; \ 1252 __entry->agbno = args->agbno;
1183 __entry->agbno = args->agbno; \ 1253 __entry->minlen = args->minlen;
1184 __entry->minlen = args->minlen; \ 1254 __entry->maxlen = args->maxlen;
1185 __entry->maxlen = args->maxlen; \ 1255 __entry->mod = args->mod;
1186 __entry->mod = args->mod; \ 1256 __entry->prod = args->prod;
1187 __entry->prod = args->prod; \ 1257 __entry->minleft = args->minleft;
1188 __entry->minleft = args->minleft; \ 1258 __entry->total = args->total;
1189 __entry->total = args->total; \ 1259 __entry->alignment = args->alignment;
1190 __entry->alignment = args->alignment; \ 1260 __entry->minalignslop = args->minalignslop;
1191 __entry->minalignslop = args->minalignslop; \ 1261 __entry->len = args->len;
1192 __entry->len = args->len; \ 1262 __entry->type = args->type;
1193 __entry->type = args->type; \ 1263 __entry->otype = args->otype;
1194 __entry->otype = args->otype; \ 1264 __entry->wasdel = args->wasdel;
1195 __entry->wasdel = args->wasdel; \ 1265 __entry->wasfromfl = args->wasfromfl;
1196 __entry->wasfromfl = args->wasfromfl; \ 1266 __entry->isfl = args->isfl;
1197 __entry->isfl = args->isfl; \ 1267 __entry->userdata = args->userdata;
1198 __entry->userdata = args->userdata; \ 1268 __entry->firstblock = args->firstblock;
1199 __entry->firstblock = args->firstblock; \ 1269 ),
1200 ), \ 1270 TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u "
1201 TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u " \ 1271 "prod %u minleft %u total %u alignment %u minalignslop %u "
1202 "prod %u minleft %u total %u alignment %u minalignslop %u " \ 1272 "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d "
1203 "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d " \ 1273 "userdata %d firstblock 0x%llx",
1204 "userdata %d firstblock 0x%llx", \ 1274 MAJOR(__entry->dev), MINOR(__entry->dev),
1205 MAJOR(__entry->dev), MINOR(__entry->dev), \ 1275 __entry->agno,
1206 __entry->agno, \ 1276 __entry->agbno,
1207 __entry->agbno, \ 1277 __entry->minlen,
1208 __entry->minlen, \ 1278 __entry->maxlen,
1209 __entry->maxlen, \ 1279 __entry->mod,
1210 __entry->mod, \ 1280 __entry->prod,
1211 __entry->prod, \ 1281 __entry->minleft,
1212 __entry->minleft, \ 1282 __entry->total,
1213 __entry->total, \ 1283 __entry->alignment,
1214 __entry->alignment, \ 1284 __entry->minalignslop,
1215 __entry->minalignslop, \ 1285 __entry->len,
1216 __entry->len, \ 1286 __print_symbolic(__entry->type, XFS_ALLOC_TYPES),
1217 __print_symbolic(__entry->type, XFS_ALLOC_TYPES), \ 1287 __print_symbolic(__entry->otype, XFS_ALLOC_TYPES),
1218 __print_symbolic(__entry->otype, XFS_ALLOC_TYPES), \ 1288 __entry->wasdel,
1219 __entry->wasdel, \ 1289 __entry->wasfromfl,
1220 __entry->wasfromfl, \ 1290 __entry->isfl,
1221 __entry->isfl, \ 1291 __entry->userdata,
1222 __entry->userdata, \ 1292 __entry->firstblock)
1223 __entry->firstblock) \
1224) 1293)
1225 1294
1295#define DEFINE_ALLOC_EVENT(name) \
1296DEFINE_EVENT(xfs_alloc_class, name, \
1297 TP_PROTO(struct xfs_alloc_arg *args), \
1298 TP_ARGS(args))
1226DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); 1299DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
1227DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); 1300DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
1228DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); 1301DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
@@ -1245,92 +1318,100 @@ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp);
1245DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed); 1318DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed);
1246DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed); 1319DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed);
1247 1320
1248#define DEFINE_DIR2_TRACE(tname) \ 1321DECLARE_EVENT_CLASS(xfs_dir2_class,
1249TRACE_EVENT(tname, \ 1322 TP_PROTO(struct xfs_da_args *args),
1323 TP_ARGS(args),
1324 TP_STRUCT__entry(
1325 __field(dev_t, dev)
1326 __field(xfs_ino_t, ino)
1327 __dynamic_array(char, name, args->namelen)
1328 __field(int, namelen)
1329 __field(xfs_dahash_t, hashval)
1330 __field(xfs_ino_t, inumber)
1331 __field(int, op_flags)
1332 ),
1333 TP_fast_assign(
1334 __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
1335 __entry->ino = args->dp->i_ino;
1336 if (args->namelen)
1337 memcpy(__get_str(name), args->name, args->namelen);
1338 __entry->namelen = args->namelen;
1339 __entry->hashval = args->hashval;
1340 __entry->inumber = args->inumber;
1341 __entry->op_flags = args->op_flags;
1342 ),
1343 TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x "
1344 "inumber 0x%llx op_flags %s",
1345 MAJOR(__entry->dev), MINOR(__entry->dev),
1346 __entry->ino,
1347 __entry->namelen,
1348 __entry->namelen ? __get_str(name) : NULL,
1349 __entry->namelen,
1350 __entry->hashval,
1351 __entry->inumber,
1352 __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
1353)
1354
1355#define DEFINE_DIR2_EVENT(name) \
1356DEFINE_EVENT(xfs_dir2_class, name, \
1250 TP_PROTO(struct xfs_da_args *args), \ 1357 TP_PROTO(struct xfs_da_args *args), \
1251 TP_ARGS(args), \ 1358 TP_ARGS(args))
1252 TP_STRUCT__entry( \ 1359DEFINE_DIR2_EVENT(xfs_dir2_sf_addname);
1253 __field(dev_t, dev) \ 1360DEFINE_DIR2_EVENT(xfs_dir2_sf_create);
1254 __field(xfs_ino_t, ino) \ 1361DEFINE_DIR2_EVENT(xfs_dir2_sf_lookup);
1255 __dynamic_array(char, name, args->namelen) \ 1362DEFINE_DIR2_EVENT(xfs_dir2_sf_replace);
1256 __field(int, namelen) \ 1363DEFINE_DIR2_EVENT(xfs_dir2_sf_removename);
1257 __field(xfs_dahash_t, hashval) \ 1364DEFINE_DIR2_EVENT(xfs_dir2_sf_toino4);
1258 __field(xfs_ino_t, inumber) \ 1365DEFINE_DIR2_EVENT(xfs_dir2_sf_toino8);
1259 __field(int, op_flags) \ 1366DEFINE_DIR2_EVENT(xfs_dir2_sf_to_block);
1260 ), \ 1367DEFINE_DIR2_EVENT(xfs_dir2_block_addname);
1261 TP_fast_assign( \ 1368DEFINE_DIR2_EVENT(xfs_dir2_block_lookup);
1262 __entry->dev = VFS_I(args->dp)->i_sb->s_dev; \ 1369DEFINE_DIR2_EVENT(xfs_dir2_block_replace);
1263 __entry->ino = args->dp->i_ino; \ 1370DEFINE_DIR2_EVENT(xfs_dir2_block_removename);
1264 if (args->namelen) \ 1371DEFINE_DIR2_EVENT(xfs_dir2_block_to_sf);
1265 memcpy(__get_str(name), args->name, args->namelen); \ 1372DEFINE_DIR2_EVENT(xfs_dir2_block_to_leaf);
1266 __entry->namelen = args->namelen; \ 1373DEFINE_DIR2_EVENT(xfs_dir2_leaf_addname);
1267 __entry->hashval = args->hashval; \ 1374DEFINE_DIR2_EVENT(xfs_dir2_leaf_lookup);
1268 __entry->inumber = args->inumber; \ 1375DEFINE_DIR2_EVENT(xfs_dir2_leaf_replace);
1269 __entry->op_flags = args->op_flags; \ 1376DEFINE_DIR2_EVENT(xfs_dir2_leaf_removename);
1270 ), \ 1377DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_block);
1271 TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x " \ 1378DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_node);
1272 "inumber 0x%llx op_flags %s", \ 1379DEFINE_DIR2_EVENT(xfs_dir2_node_addname);
1273 MAJOR(__entry->dev), MINOR(__entry->dev), \ 1380DEFINE_DIR2_EVENT(xfs_dir2_node_lookup);
1274 __entry->ino, \ 1381DEFINE_DIR2_EVENT(xfs_dir2_node_replace);
1275 __entry->namelen, \ 1382DEFINE_DIR2_EVENT(xfs_dir2_node_removename);
1276 __entry->namelen ? __get_str(name) : NULL, \ 1383DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf);
1277 __entry->namelen, \ 1384
1278 __entry->hashval, \ 1385DECLARE_EVENT_CLASS(xfs_dir2_space_class,
1279 __entry->inumber, \ 1386 TP_PROTO(struct xfs_da_args *args, int idx),
1280 __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) \ 1387 TP_ARGS(args, idx),
1388 TP_STRUCT__entry(
1389 __field(dev_t, dev)
1390 __field(xfs_ino_t, ino)
1391 __field(int, op_flags)
1392 __field(int, idx)
1393 ),
1394 TP_fast_assign(
1395 __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
1396 __entry->ino = args->dp->i_ino;
1397 __entry->op_flags = args->op_flags;
1398 __entry->idx = idx;
1399 ),
1400 TP_printk("dev %d:%d ino 0x%llx op_flags %s index %d",
1401 MAJOR(__entry->dev), MINOR(__entry->dev),
1402 __entry->ino,
1403 __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS),
1404 __entry->idx)
1281) 1405)
1282DEFINE_DIR2_TRACE(xfs_dir2_sf_addname); 1406
1283DEFINE_DIR2_TRACE(xfs_dir2_sf_create); 1407#define DEFINE_DIR2_SPACE_EVENT(name) \
1284DEFINE_DIR2_TRACE(xfs_dir2_sf_lookup); 1408DEFINE_EVENT(xfs_dir2_space_class, name, \
1285DEFINE_DIR2_TRACE(xfs_dir2_sf_replace);
1286DEFINE_DIR2_TRACE(xfs_dir2_sf_removename);
1287DEFINE_DIR2_TRACE(xfs_dir2_sf_toino4);
1288DEFINE_DIR2_TRACE(xfs_dir2_sf_toino8);
1289DEFINE_DIR2_TRACE(xfs_dir2_sf_to_block);
1290DEFINE_DIR2_TRACE(xfs_dir2_block_addname);
1291DEFINE_DIR2_TRACE(xfs_dir2_block_lookup);
1292DEFINE_DIR2_TRACE(xfs_dir2_block_replace);
1293DEFINE_DIR2_TRACE(xfs_dir2_block_removename);
1294DEFINE_DIR2_TRACE(xfs_dir2_block_to_sf);
1295DEFINE_DIR2_TRACE(xfs_dir2_block_to_leaf);
1296DEFINE_DIR2_TRACE(xfs_dir2_leaf_addname);
1297DEFINE_DIR2_TRACE(xfs_dir2_leaf_lookup);
1298DEFINE_DIR2_TRACE(xfs_dir2_leaf_replace);
1299DEFINE_DIR2_TRACE(xfs_dir2_leaf_removename);
1300DEFINE_DIR2_TRACE(xfs_dir2_leaf_to_block);
1301DEFINE_DIR2_TRACE(xfs_dir2_leaf_to_node);
1302DEFINE_DIR2_TRACE(xfs_dir2_node_addname);
1303DEFINE_DIR2_TRACE(xfs_dir2_node_lookup);
1304DEFINE_DIR2_TRACE(xfs_dir2_node_replace);
1305DEFINE_DIR2_TRACE(xfs_dir2_node_removename);
1306DEFINE_DIR2_TRACE(xfs_dir2_node_to_leaf);
1307
1308#define DEFINE_DIR2_SPACE_TRACE(tname) \
1309TRACE_EVENT(tname, \
1310 TP_PROTO(struct xfs_da_args *args, int idx), \ 1409 TP_PROTO(struct xfs_da_args *args, int idx), \
1311 TP_ARGS(args, idx), \ 1410 TP_ARGS(args, idx))
1312 TP_STRUCT__entry( \ 1411DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_add);
1313 __field(dev_t, dev) \ 1412DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_remove);
1314 __field(xfs_ino_t, ino) \ 1413DEFINE_DIR2_SPACE_EVENT(xfs_dir2_grow_inode);
1315 __field(int, op_flags) \ 1414DEFINE_DIR2_SPACE_EVENT(xfs_dir2_shrink_inode);
1316 __field(int, idx) \
1317 ), \
1318 TP_fast_assign( \
1319 __entry->dev = VFS_I(args->dp)->i_sb->s_dev; \
1320 __entry->ino = args->dp->i_ino; \
1321 __entry->op_flags = args->op_flags; \
1322 __entry->idx = idx; \
1323 ), \
1324 TP_printk("dev %d:%d ino 0x%llx op_flags %s index %d", \
1325 MAJOR(__entry->dev), MINOR(__entry->dev), \
1326 __entry->ino, \
1327 __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), \
1328 __entry->idx) \
1329)
1330DEFINE_DIR2_SPACE_TRACE(xfs_dir2_leafn_add);
1331DEFINE_DIR2_SPACE_TRACE(xfs_dir2_leafn_remove);
1332DEFINE_DIR2_SPACE_TRACE(xfs_dir2_grow_inode);
1333DEFINE_DIR2_SPACE_TRACE(xfs_dir2_shrink_inode);
1334 1415
1335TRACE_EVENT(xfs_dir2_leafn_moveents, 1416TRACE_EVENT(xfs_dir2_leafn_moveents,
1336 TP_PROTO(struct xfs_da_args *args, int src_idx, int dst_idx, int count), 1417 TP_PROTO(struct xfs_da_args *args, int src_idx, int dst_idx, int count),
@@ -1361,6 +1442,59 @@ TRACE_EVENT(xfs_dir2_leafn_moveents,
1361 __entry->count) 1442 __entry->count)
1362); 1443);
1363 1444
1445#define XFS_SWAPEXT_INODES \
1446 { 0, "target" }, \
1447 { 1, "temp" }
1448
1449#define XFS_INODE_FORMAT_STR \
1450 { 0, "invalid" }, \
1451 { 1, "local" }, \
1452 { 2, "extent" }, \
1453 { 3, "btree" }
1454
1455DECLARE_EVENT_CLASS(xfs_swap_extent_class,
1456 TP_PROTO(struct xfs_inode *ip, int which),
1457 TP_ARGS(ip, which),
1458 TP_STRUCT__entry(
1459 __field(dev_t, dev)
1460 __field(int, which)
1461 __field(xfs_ino_t, ino)
1462 __field(int, format)
1463 __field(int, nex)
1464 __field(int, max_nex)
1465 __field(int, broot_size)
1466 __field(int, fork_off)
1467 ),
1468 TP_fast_assign(
1469 __entry->dev = VFS_I(ip)->i_sb->s_dev;
1470 __entry->which = which;
1471 __entry->ino = ip->i_ino;
1472 __entry->format = ip->i_d.di_format;
1473 __entry->nex = ip->i_d.di_nextents;
1474 __entry->max_nex = ip->i_df.if_ext_max;
1475 __entry->broot_size = ip->i_df.if_broot_bytes;
1476 __entry->fork_off = XFS_IFORK_BOFF(ip);
1477 ),
1478 TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
1479 "Max in-fork extents %d, broot size %d, fork offset %d",
1480 MAJOR(__entry->dev), MINOR(__entry->dev),
1481 __entry->ino,
1482 __print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
1483 __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
1484 __entry->nex,
1485 __entry->max_nex,
1486 __entry->broot_size,
1487 __entry->fork_off)
1488)
1489
1490#define DEFINE_SWAPEXT_EVENT(name) \
1491DEFINE_EVENT(xfs_swap_extent_class, name, \
1492 TP_PROTO(struct xfs_inode *ip, int which), \
1493 TP_ARGS(ip, which))
1494
1495DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
1496DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
1497
1364#endif /* _TRACE_XFS_H */ 1498#endif /* _TRACE_XFS_H */
1365 1499
1366#undef TRACE_INCLUDE_PATH 1500#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index 0b1878857fc3..fa01b9daba6b 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -45,7 +45,7 @@ xfs_xattr_get(struct dentry *dentry, const char *name,
45 value = NULL; 45 value = NULL;
46 } 46 }
47 47
48 error = -xfs_attr_get(ip, name, value, &asize, xflags); 48 error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
49 if (error) 49 if (error)
50 return error; 50 return error;
51 return asize; 51 return asize;
@@ -67,8 +67,9 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
67 xflags |= ATTR_REPLACE; 67 xflags |= ATTR_REPLACE;
68 68
69 if (!value) 69 if (!value)
70 return -xfs_attr_remove(ip, name, xflags); 70 return -xfs_attr_remove(ip, (unsigned char *)name, xflags);
71 return -xfs_attr_set(ip, name, (void *)value, size, xflags); 71 return -xfs_attr_set(ip, (unsigned char *)name,
72 (void *)value, size, xflags);
72} 73}
73 74
74static struct xattr_handler xfs_xattr_user_handler = { 75static struct xattr_handler xfs_xattr_user_handler = {
@@ -124,8 +125,13 @@ static const char *xfs_xattr_prefix(int flags)
124} 125}
125 126
126static int 127static int
127xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags, 128xfs_xattr_put_listent(
128 char *name, int namelen, int valuelen, char *value) 129 struct xfs_attr_list_context *context,
130 int flags,
131 unsigned char *name,
132 int namelen,
133 int valuelen,
134 unsigned char *value)
129{ 135{
130 unsigned int prefix_len = xfs_xattr_prefix_len(flags); 136 unsigned int prefix_len = xfs_xattr_prefix_len(flags);
131 char *offset; 137 char *offset;
@@ -148,7 +154,7 @@ xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
148 offset = (char *)context->alist + context->count; 154 offset = (char *)context->alist + context->count;
149 strncpy(offset, xfs_xattr_prefix(flags), prefix_len); 155 strncpy(offset, xfs_xattr_prefix(flags), prefix_len);
150 offset += prefix_len; 156 offset += prefix_len;
151 strncpy(offset, name, namelen); /* real name */ 157 strncpy(offset, (char *)name, namelen); /* real name */
152 offset += namelen; 158 offset += namelen;
153 *offset = '\0'; 159 *offset = '\0';
154 context->count += prefix_len + namelen + 1; 160 context->count += prefix_len + namelen + 1;
@@ -156,8 +162,13 @@ xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
156} 162}
157 163
158static int 164static int
159xfs_xattr_put_listent_sizes(struct xfs_attr_list_context *context, int flags, 165xfs_xattr_put_listent_sizes(
160 char *name, int namelen, int valuelen, char *value) 166 struct xfs_attr_list_context *context,
167 int flags,
168 unsigned char *name,
169 int namelen,
170 int valuelen,
171 unsigned char *value)
161{ 172{
162 context->count += xfs_xattr_prefix_len(flags) + namelen + 1; 173 context->count += xfs_xattr_prefix_len(flags) + namelen + 1;
163 return 0; 174 return 0;
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index d7c7eea09fc2..5f79dd78626b 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -1187,7 +1187,7 @@ xfs_qm_dqflush(
1187 * block, nada. 1187 * block, nada.
1188 */ 1188 */
1189 if (!XFS_DQ_IS_DIRTY(dqp) || 1189 if (!XFS_DQ_IS_DIRTY(dqp) ||
1190 (!(flags & XFS_QMOPT_SYNC) && atomic_read(&dqp->q_pincount) > 0)) { 1190 (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) {
1191 xfs_dqfunlock(dqp); 1191 xfs_dqfunlock(dqp);
1192 return 0; 1192 return 0;
1193 } 1193 }
@@ -1248,23 +1248,20 @@ xfs_qm_dqflush(
1248 */ 1248 */
1249 if (XFS_BUF_ISPINNED(bp)) { 1249 if (XFS_BUF_ISPINNED(bp)) {
1250 trace_xfs_dqflush_force(dqp); 1250 trace_xfs_dqflush_force(dqp);
1251 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 1251 xfs_log_force(mp, 0);
1252 } 1252 }
1253 1253
1254 if (flags & XFS_QMOPT_DELWRI) { 1254 if (flags & SYNC_WAIT)
1255 xfs_bdwrite(mp, bp);
1256 } else if (flags & XFS_QMOPT_ASYNC) {
1257 error = xfs_bawrite(mp, bp);
1258 } else {
1259 error = xfs_bwrite(mp, bp); 1255 error = xfs_bwrite(mp, bp);
1260 } 1256 else
1257 xfs_bdwrite(mp, bp);
1261 1258
1262 trace_xfs_dqflush_done(dqp); 1259 trace_xfs_dqflush_done(dqp);
1263 1260
1264 /* 1261 /*
1265 * dqp is still locked, but caller is free to unlock it now. 1262 * dqp is still locked, but caller is free to unlock it now.
1266 */ 1263 */
1267 return (error); 1264 return error;
1268 1265
1269} 1266}
1270 1267
@@ -1445,7 +1442,7 @@ xfs_qm_dqpurge(
1445 * We don't care about getting disk errors here. We need 1442 * We don't care about getting disk errors here. We need
1446 * to purge this dquot anyway, so we go ahead regardless. 1443 * to purge this dquot anyway, so we go ahead regardless.
1447 */ 1444 */
1448 error = xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC); 1445 error = xfs_qm_dqflush(dqp, SYNC_WAIT);
1449 if (error) 1446 if (error)
1450 xfs_fs_cmn_err(CE_WARN, mp, 1447 xfs_fs_cmn_err(CE_WARN, mp,
1451 "xfs_qm_dqpurge: dquot %p flush failed", dqp); 1448 "xfs_qm_dqpurge: dquot %p flush failed", dqp);
@@ -1529,25 +1526,17 @@ xfs_qm_dqflock_pushbuf_wait(
1529 * the flush lock when the I/O completes. 1526 * the flush lock when the I/O completes.
1530 */ 1527 */
1531 bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno, 1528 bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno,
1532 XFS_QI_DQCHUNKLEN(dqp->q_mount), 1529 XFS_QI_DQCHUNKLEN(dqp->q_mount), XBF_TRYLOCK);
1533 XFS_INCORE_TRYLOCK); 1530 if (!bp)
1534 if (bp != NULL) { 1531 goto out_lock;
1535 if (XFS_BUF_ISDELAYWRITE(bp)) { 1532
1536 int error; 1533 if (XFS_BUF_ISDELAYWRITE(bp)) {
1537 if (XFS_BUF_ISPINNED(bp)) { 1534 if (XFS_BUF_ISPINNED(bp))
1538 xfs_log_force(dqp->q_mount, 1535 xfs_log_force(dqp->q_mount, 0);
1539 (xfs_lsn_t)0, 1536 xfs_buf_delwri_promote(bp);
1540 XFS_LOG_FORCE); 1537 wake_up_process(bp->b_target->bt_task);
1541 }
1542 error = xfs_bawrite(dqp->q_mount, bp);
1543 if (error)
1544 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
1545 "xfs_qm_dqflock_pushbuf_wait: "
1546 "pushbuf error %d on dqp %p, bp %p",
1547 error, dqp, bp);
1548 } else {
1549 xfs_buf_relse(bp);
1550 }
1551 } 1538 }
1539 xfs_buf_relse(bp);
1540out_lock:
1552 xfs_dqflock(dqp); 1541 xfs_dqflock(dqp);
1553} 1542}
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index d0d4a9a0bbd7..4e4ee9a57194 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -74,11 +74,11 @@ xfs_qm_dquot_logitem_format(
74 74
75 logvec->i_addr = (xfs_caddr_t)&logitem->qli_format; 75 logvec->i_addr = (xfs_caddr_t)&logitem->qli_format;
76 logvec->i_len = sizeof(xfs_dq_logformat_t); 76 logvec->i_len = sizeof(xfs_dq_logformat_t);
77 XLOG_VEC_SET_TYPE(logvec, XLOG_REG_TYPE_QFORMAT); 77 logvec->i_type = XLOG_REG_TYPE_QFORMAT;
78 logvec++; 78 logvec++;
79 logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core; 79 logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core;
80 logvec->i_len = sizeof(xfs_disk_dquot_t); 80 logvec->i_len = sizeof(xfs_disk_dquot_t);
81 XLOG_VEC_SET_TYPE(logvec, XLOG_REG_TYPE_DQUOT); 81 logvec->i_type = XLOG_REG_TYPE_DQUOT;
82 82
83 ASSERT(2 == logitem->qli_item.li_desc->lid_size); 83 ASSERT(2 == logitem->qli_item.li_desc->lid_size);
84 logitem->qli_format.qlf_size = 2; 84 logitem->qli_format.qlf_size = 2;
@@ -153,7 +153,7 @@ xfs_qm_dquot_logitem_push(
153 * lock without sleeping, then there must not have been 153 * lock without sleeping, then there must not have been
154 * anyone in the process of flushing the dquot. 154 * anyone in the process of flushing the dquot.
155 */ 155 */
156 error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); 156 error = xfs_qm_dqflush(dqp, 0);
157 if (error) 157 if (error)
158 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 158 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
159 "xfs_qm_dquot_logitem_push: push error %d on dqp %p", 159 "xfs_qm_dquot_logitem_push: push error %d on dqp %p",
@@ -190,7 +190,7 @@ xfs_qm_dqunpin_wait(
190 /* 190 /*
191 * Give the log a push so we don't wait here too long. 191 * Give the log a push so we don't wait here too long.
192 */ 192 */
193 xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE); 193 xfs_log_force(dqp->q_mount, 0);
194 wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); 194 wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
195} 195}
196 196
@@ -212,68 +212,31 @@ xfs_qm_dquot_logitem_pushbuf(
212 xfs_dquot_t *dqp; 212 xfs_dquot_t *dqp;
213 xfs_mount_t *mp; 213 xfs_mount_t *mp;
214 xfs_buf_t *bp; 214 xfs_buf_t *bp;
215 uint dopush;
216 215
217 dqp = qip->qli_dquot; 216 dqp = qip->qli_dquot;
218 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 217 ASSERT(XFS_DQ_IS_LOCKED(dqp));
219 218
220 /* 219 /*
221 * The qli_pushbuf_flag keeps others from
222 * trying to duplicate our effort.
223 */
224 ASSERT(qip->qli_pushbuf_flag != 0);
225 ASSERT(qip->qli_push_owner == current_pid());
226
227 /*
228 * If flushlock isn't locked anymore, chances are that the 220 * If flushlock isn't locked anymore, chances are that the
229 * inode flush completed and the inode was taken off the AIL. 221 * inode flush completed and the inode was taken off the AIL.
230 * So, just get out. 222 * So, just get out.
231 */ 223 */
232 if (completion_done(&dqp->q_flush) || 224 if (completion_done(&dqp->q_flush) ||
233 ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) { 225 ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
234 qip->qli_pushbuf_flag = 0;
235 xfs_dqunlock(dqp); 226 xfs_dqunlock(dqp);
236 return; 227 return;
237 } 228 }
238 mp = dqp->q_mount; 229 mp = dqp->q_mount;
239 bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno, 230 bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
240 XFS_QI_DQCHUNKLEN(mp), 231 XFS_QI_DQCHUNKLEN(mp), XBF_TRYLOCK);
241 XFS_INCORE_TRYLOCK); 232 xfs_dqunlock(dqp);
242 if (bp != NULL) { 233 if (!bp)
243 if (XFS_BUF_ISDELAYWRITE(bp)) {
244 dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
245 !completion_done(&dqp->q_flush));
246 qip->qli_pushbuf_flag = 0;
247 xfs_dqunlock(dqp);
248
249 if (XFS_BUF_ISPINNED(bp)) {
250 xfs_log_force(mp, (xfs_lsn_t)0,
251 XFS_LOG_FORCE);
252 }
253 if (dopush) {
254 int error;
255#ifdef XFSRACEDEBUG
256 delay_for_intr();
257 delay(300);
258#endif
259 error = xfs_bawrite(mp, bp);
260 if (error)
261 xfs_fs_cmn_err(CE_WARN, mp,
262 "xfs_qm_dquot_logitem_pushbuf: pushbuf error %d on qip %p, bp %p",
263 error, qip, bp);
264 } else {
265 xfs_buf_relse(bp);
266 }
267 } else {
268 qip->qli_pushbuf_flag = 0;
269 xfs_dqunlock(dqp);
270 xfs_buf_relse(bp);
271 }
272 return; 234 return;
273 } 235 if (XFS_BUF_ISDELAYWRITE(bp))
236 xfs_buf_delwri_promote(bp);
237 xfs_buf_relse(bp);
238 return;
274 239
275 qip->qli_pushbuf_flag = 0;
276 xfs_dqunlock(dqp);
277} 240}
278 241
279/* 242/*
@@ -291,50 +254,24 @@ xfs_qm_dquot_logitem_trylock(
291 xfs_dq_logitem_t *qip) 254 xfs_dq_logitem_t *qip)
292{ 255{
293 xfs_dquot_t *dqp; 256 xfs_dquot_t *dqp;
294 uint retval;
295 257
296 dqp = qip->qli_dquot; 258 dqp = qip->qli_dquot;
297 if (atomic_read(&dqp->q_pincount) > 0) 259 if (atomic_read(&dqp->q_pincount) > 0)
298 return (XFS_ITEM_PINNED); 260 return XFS_ITEM_PINNED;
299 261
300 if (! xfs_qm_dqlock_nowait(dqp)) 262 if (! xfs_qm_dqlock_nowait(dqp))
301 return (XFS_ITEM_LOCKED); 263 return XFS_ITEM_LOCKED;
302 264
303 retval = XFS_ITEM_SUCCESS;
304 if (!xfs_dqflock_nowait(dqp)) { 265 if (!xfs_dqflock_nowait(dqp)) {
305 /* 266 /*
306 * The dquot is already being flushed. It may have been 267 * dquot has already been flushed to the backing buffer,
307 * flushed delayed write, however, and we don't want to 268 * leave it locked, pushbuf routine will unlock it.
308 * get stuck waiting for that to complete. So, we want to check
309 * to see if we can lock the dquot's buffer without sleeping.
310 * If we can and it is marked for delayed write, then we
311 * hold it and send it out from the push routine. We don't
312 * want to do that now since we might sleep in the device
313 * strategy routine. We also don't want to grab the buffer lock
314 * here because we'd like not to call into the buffer cache
315 * while holding the AIL lock.
316 * Make sure to only return PUSHBUF if we set pushbuf_flag
317 * ourselves. If someone else is doing it then we don't
318 * want to go to the push routine and duplicate their efforts.
319 */ 269 */
320 if (qip->qli_pushbuf_flag == 0) { 270 return XFS_ITEM_PUSHBUF;
321 qip->qli_pushbuf_flag = 1;
322 ASSERT(qip->qli_format.qlf_blkno == dqp->q_blkno);
323#ifdef DEBUG
324 qip->qli_push_owner = current_pid();
325#endif
326 /*
327 * The dquot is left locked.
328 */
329 retval = XFS_ITEM_PUSHBUF;
330 } else {
331 retval = XFS_ITEM_FLUSHING;
332 xfs_dqunlock_nonotify(dqp);
333 }
334 } 271 }
335 272
336 ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL); 273 ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL);
337 return (retval); 274 return XFS_ITEM_SUCCESS;
338} 275}
339 276
340 277
@@ -467,7 +404,7 @@ xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t *qf,
467 404
468 log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format); 405 log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format);
469 log_vector->i_len = sizeof(xfs_qoff_logitem_t); 406 log_vector->i_len = sizeof(xfs_qoff_logitem_t);
470 XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_QUOTAOFF); 407 log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF;
471 qf->qql_format.qf_size = 1; 408 qf->qql_format.qf_size = 1;
472} 409}
473 410
diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/quota/xfs_dquot_item.h
index 5a632531f843..5acae2ada70b 100644
--- a/fs/xfs/quota/xfs_dquot_item.h
+++ b/fs/xfs/quota/xfs_dquot_item.h
@@ -27,10 +27,6 @@ typedef struct xfs_dq_logitem {
27 xfs_log_item_t qli_item; /* common portion */ 27 xfs_log_item_t qli_item; /* common portion */
28 struct xfs_dquot *qli_dquot; /* dquot ptr */ 28 struct xfs_dquot *qli_dquot; /* dquot ptr */
29 xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ 29 xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
30 unsigned short qli_pushbuf_flag; /* 1 bit used in push_ail */
31#ifdef DEBUG
32 uint64_t qli_push_owner;
33#endif
34 xfs_dq_logformat_t qli_format; /* logged structure */ 30 xfs_dq_logformat_t qli_format; /* logged structure */
35} xfs_dq_logitem_t; 31} xfs_dq_logitem_t;
36 32
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 9e627a8b5b0e..417e61e3d9dd 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -118,9 +118,14 @@ xfs_Gqm_init(void)
118 */ 118 */
119 udqhash = kmem_zalloc_greedy(&hsize, 119 udqhash = kmem_zalloc_greedy(&hsize,
120 XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t), 120 XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t),
121 XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t), 121 XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t));
122 KM_SLEEP | KM_MAYFAIL | KM_LARGE); 122 if (!udqhash)
123 gdqhash = kmem_zalloc(hsize, KM_SLEEP | KM_LARGE); 123 goto out;
124
125 gdqhash = kmem_zalloc_large(hsize);
126 if (!gdqhash)
127 goto out_free_udqhash;
128
124 hsize /= sizeof(xfs_dqhash_t); 129 hsize /= sizeof(xfs_dqhash_t);
125 ndquot = hsize << 8; 130 ndquot = hsize << 8;
126 131
@@ -170,6 +175,11 @@ xfs_Gqm_init(void)
170 mutex_init(&qcheck_lock); 175 mutex_init(&qcheck_lock);
171#endif 176#endif
172 return xqm; 177 return xqm;
178
179 out_free_udqhash:
180 kmem_free_large(udqhash);
181 out:
182 return NULL;
173} 183}
174 184
175/* 185/*
@@ -189,8 +199,8 @@ xfs_qm_destroy(
189 xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); 199 xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
190 xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i])); 200 xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i]));
191 } 201 }
192 kmem_free(xqm->qm_usr_dqhtable); 202 kmem_free_large(xqm->qm_usr_dqhtable);
193 kmem_free(xqm->qm_grp_dqhtable); 203 kmem_free_large(xqm->qm_grp_dqhtable);
194 xqm->qm_usr_dqhtable = NULL; 204 xqm->qm_usr_dqhtable = NULL;
195 xqm->qm_grp_dqhtable = NULL; 205 xqm->qm_grp_dqhtable = NULL;
196 xqm->qm_dqhashmask = 0; 206 xqm->qm_dqhashmask = 0;
@@ -219,8 +229,12 @@ xfs_qm_hold_quotafs_ref(
219 */ 229 */
220 mutex_lock(&xfs_Gqm_lock); 230 mutex_lock(&xfs_Gqm_lock);
221 231
222 if (xfs_Gqm == NULL) 232 if (!xfs_Gqm) {
223 xfs_Gqm = xfs_Gqm_init(); 233 xfs_Gqm = xfs_Gqm_init();
234 if (!xfs_Gqm)
235 return ENOMEM;
236 }
237
224 /* 238 /*
225 * We can keep a list of all filesystems with quotas mounted for 239 * We can keep a list of all filesystems with quotas mounted for
226 * debugging and statistical purposes, but ... 240 * debugging and statistical purposes, but ...
@@ -436,7 +450,7 @@ xfs_qm_unmount_quotas(
436STATIC int 450STATIC int
437xfs_qm_dqflush_all( 451xfs_qm_dqflush_all(
438 xfs_mount_t *mp, 452 xfs_mount_t *mp,
439 int flags) 453 int sync_mode)
440{ 454{
441 int recl; 455 int recl;
442 xfs_dquot_t *dqp; 456 xfs_dquot_t *dqp;
@@ -472,7 +486,7 @@ again:
472 * across a disk write. 486 * across a disk write.
473 */ 487 */
474 xfs_qm_mplist_unlock(mp); 488 xfs_qm_mplist_unlock(mp);
475 error = xfs_qm_dqflush(dqp, flags); 489 error = xfs_qm_dqflush(dqp, sync_mode);
476 xfs_dqunlock(dqp); 490 xfs_dqunlock(dqp);
477 if (error) 491 if (error)
478 return error; 492 return error;
@@ -912,13 +926,11 @@ xfs_qm_sync(
912{ 926{
913 int recl, restarts; 927 int recl, restarts;
914 xfs_dquot_t *dqp; 928 xfs_dquot_t *dqp;
915 uint flush_flags;
916 int error; 929 int error;
917 930
918 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) 931 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
919 return 0; 932 return 0;
920 933
921 flush_flags = (flags & SYNC_WAIT) ? XFS_QMOPT_SYNC : XFS_QMOPT_DELWRI;
922 restarts = 0; 934 restarts = 0;
923 935
924 again: 936 again:
@@ -978,7 +990,7 @@ xfs_qm_sync(
978 * across a disk write 990 * across a disk write
979 */ 991 */
980 xfs_qm_mplist_unlock(mp); 992 xfs_qm_mplist_unlock(mp);
981 error = xfs_qm_dqflush(dqp, flush_flags); 993 error = xfs_qm_dqflush(dqp, flags);
982 xfs_dqunlock(dqp); 994 xfs_dqunlock(dqp);
983 if (error && XFS_FORCED_SHUTDOWN(mp)) 995 if (error && XFS_FORCED_SHUTDOWN(mp))
984 return 0; /* Need to prevent umount failure */ 996 return 0; /* Need to prevent umount failure */
@@ -1782,7 +1794,7 @@ xfs_qm_quotacheck(
1782 * successfully. 1794 * successfully.
1783 */ 1795 */
1784 if (!error) 1796 if (!error)
1785 error = xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI); 1797 error = xfs_qm_dqflush_all(mp, 0);
1786 1798
1787 /* 1799 /*
1788 * We can get this error if we couldn't do a dquot allocation inside 1800 * We can get this error if we couldn't do a dquot allocation inside
@@ -2004,7 +2016,7 @@ xfs_qm_shake_freelist(
2004 * We flush it delayed write, so don't bother 2016 * We flush it delayed write, so don't bother
2005 * releasing the mplock. 2017 * releasing the mplock.
2006 */ 2018 */
2007 error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); 2019 error = xfs_qm_dqflush(dqp, 0);
2008 if (error) { 2020 if (error) {
2009 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 2021 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
2010 "xfs_qm_dqflush_all: dquot %p flush failed", dqp); 2022 "xfs_qm_dqflush_all: dquot %p flush failed", dqp);
@@ -2187,7 +2199,7 @@ xfs_qm_dqreclaim_one(void)
2187 * We flush it delayed write, so don't bother 2199 * We flush it delayed write, so don't bother
2188 * releasing the freelist lock. 2200 * releasing the freelist lock.
2189 */ 2201 */
2190 error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); 2202 error = xfs_qm_dqflush(dqp, 0);
2191 if (error) { 2203 if (error) {
2192 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 2204 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
2193 "xfs_qm_dqreclaim: dquot %p flush failed", dqp); 2205 "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index a5346630dfae..97b410c12794 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -59,7 +59,7 @@ xfs_fill_statvfs_from_dquot(
59 be64_to_cpu(dp->d_blk_hardlimit); 59 be64_to_cpu(dp->d_blk_hardlimit);
60 if (limit && statp->f_blocks > limit) { 60 if (limit && statp->f_blocks > limit) {
61 statp->f_blocks = limit; 61 statp->f_blocks = limit;
62 statp->f_bfree = 62 statp->f_bfree = statp->f_bavail =
63 (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ? 63 (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ?
64 (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0; 64 (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0;
65 } 65 }
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 71af76fe8a23..5d0ee8d492db 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -891,7 +891,7 @@ xfs_qm_dqrele_all_inodes(
891 uint flags) 891 uint flags)
892{ 892{
893 ASSERT(mp->m_quotainfo); 893 ASSERT(mp->m_quotainfo);
894 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG); 894 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0);
895} 895}
896 896
897/*------------------------------------------------------------------------*/ 897/*------------------------------------------------------------------------*/
@@ -1192,9 +1192,9 @@ xfs_qm_internalqcheck(
1192 if (! XFS_IS_QUOTA_ON(mp)) 1192 if (! XFS_IS_QUOTA_ON(mp))
1193 return XFS_ERROR(ESRCH); 1193 return XFS_ERROR(ESRCH);
1194 1194
1195 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); 1195 xfs_log_force(mp, XFS_LOG_SYNC);
1196 XFS_bflush(mp->m_ddev_targp); 1196 XFS_bflush(mp->m_ddev_targp);
1197 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); 1197 xfs_log_force(mp, XFS_LOG_SYNC);
1198 XFS_bflush(mp->m_ddev_targp); 1198 XFS_bflush(mp->m_ddev_targp);
1199 1199
1200 mutex_lock(&qcheck_lock); 1200 mutex_lock(&qcheck_lock);
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 97ac9640be98..c3ab75cb1d9a 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -589,12 +589,18 @@ xfs_trans_unreserve_and_mod_dquots(
589 } 589 }
590} 590}
591 591
592STATIC int 592STATIC void
593xfs_quota_error(uint flags) 593xfs_quota_warn(
594 struct xfs_mount *mp,
595 struct xfs_dquot *dqp,
596 int type)
594{ 597{
595 if (flags & XFS_QMOPT_ENOSPC) 598 /* no warnings for project quotas - we just return ENOSPC later */
596 return ENOSPC; 599 if (dqp->dq_flags & XFS_DQ_PROJ)
597 return EDQUOT; 600 return;
601 quota_send_warning((dqp->dq_flags & XFS_DQ_USER) ? USRQUOTA : GRPQUOTA,
602 be32_to_cpu(dqp->q_core.d_id), mp->m_super->s_dev,
603 type);
598} 604}
599 605
600/* 606/*
@@ -612,7 +618,6 @@ xfs_trans_dqresv(
612 long ninos, 618 long ninos,
613 uint flags) 619 uint flags)
614{ 620{
615 int error;
616 xfs_qcnt_t hardlimit; 621 xfs_qcnt_t hardlimit;
617 xfs_qcnt_t softlimit; 622 xfs_qcnt_t softlimit;
618 time_t timer; 623 time_t timer;
@@ -649,7 +654,6 @@ xfs_trans_dqresv(
649 warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount); 654 warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount);
650 resbcountp = &dqp->q_res_rtbcount; 655 resbcountp = &dqp->q_res_rtbcount;
651 } 656 }
652 error = 0;
653 657
654 if ((flags & XFS_QMOPT_FORCE_RES) == 0 && 658 if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
655 dqp->q_core.d_id && 659 dqp->q_core.d_id &&
@@ -667,18 +671,20 @@ xfs_trans_dqresv(
667 * nblks. 671 * nblks.
668 */ 672 */
669 if (hardlimit > 0ULL && 673 if (hardlimit > 0ULL &&
670 (hardlimit <= nblks + *resbcountp)) { 674 hardlimit <= nblks + *resbcountp) {
671 error = xfs_quota_error(flags); 675 xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
672 goto error_return; 676 goto error_return;
673 } 677 }
674
675 if (softlimit > 0ULL && 678 if (softlimit > 0ULL &&
676 (softlimit <= nblks + *resbcountp)) { 679 softlimit <= nblks + *resbcountp) {
677 if ((timer != 0 && get_seconds() > timer) || 680 if ((timer != 0 && get_seconds() > timer) ||
678 (warns != 0 && warns >= warnlimit)) { 681 (warns != 0 && warns >= warnlimit)) {
679 error = xfs_quota_error(flags); 682 xfs_quota_warn(mp, dqp,
683 QUOTA_NL_BSOFTLONGWARN);
680 goto error_return; 684 goto error_return;
681 } 685 }
686
687 xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN);
682 } 688 }
683 } 689 }
684 if (ninos > 0) { 690 if (ninos > 0) {
@@ -692,15 +698,19 @@ xfs_trans_dqresv(
692 softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); 698 softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
693 if (!softlimit) 699 if (!softlimit)
694 softlimit = q->qi_isoftlimit; 700 softlimit = q->qi_isoftlimit;
701
695 if (hardlimit > 0ULL && count >= hardlimit) { 702 if (hardlimit > 0ULL && count >= hardlimit) {
696 error = xfs_quota_error(flags); 703 xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
697 goto error_return; 704 goto error_return;
698 } else if (softlimit > 0ULL && count >= softlimit) { 705 }
699 if ((timer != 0 && get_seconds() > timer) || 706 if (softlimit > 0ULL && count >= softlimit) {
707 if ((timer != 0 && get_seconds() > timer) ||
700 (warns != 0 && warns >= warnlimit)) { 708 (warns != 0 && warns >= warnlimit)) {
701 error = xfs_quota_error(flags); 709 xfs_quota_warn(mp, dqp,
710 QUOTA_NL_ISOFTLONGWARN);
702 goto error_return; 711 goto error_return;
703 } 712 }
713 xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN);
704 } 714 }
705 } 715 }
706 } 716 }
@@ -736,9 +746,14 @@ xfs_trans_dqresv(
736 ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount)); 746 ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount));
737 ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount)); 747 ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
738 748
749 xfs_dqunlock(dqp);
750 return 0;
751
739error_return: 752error_return:
740 xfs_dqunlock(dqp); 753 xfs_dqunlock(dqp);
741 return error; 754 if (flags & XFS_QMOPT_ENOSPC)
755 return ENOSPC;
756 return EDQUOT;
742} 757}
743 758
744 759
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 00fd357c3e46..d13eeba2c8f8 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -36,8 +36,8 @@ struct xfs_acl {
36}; 36};
37 37
38/* On-disk XFS extended attribute names */ 38/* On-disk XFS extended attribute names */
39#define SGI_ACL_FILE "SGI_ACL_FILE" 39#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE"
40#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT" 40#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT"
41#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) 41#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1)
42#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) 42#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
43 43
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 6702bd865811..b1a5a1ff88ea 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -187,17 +187,13 @@ typedef struct xfs_perag_busy {
187/* 187/*
188 * Per-ag incore structure, copies of information in agf and agi, 188 * Per-ag incore structure, copies of information in agf and agi,
189 * to improve the performance of allocation group selection. 189 * to improve the performance of allocation group selection.
190 *
191 * pick sizes which fit in allocation buckets well
192 */ 190 */
193#if (BITS_PER_LONG == 32)
194#define XFS_PAGB_NUM_SLOTS 84
195#elif (BITS_PER_LONG == 64)
196#define XFS_PAGB_NUM_SLOTS 128 191#define XFS_PAGB_NUM_SLOTS 128
197#endif
198 192
199typedef struct xfs_perag 193typedef struct xfs_perag {
200{ 194 struct xfs_mount *pag_mount; /* owner filesystem */
195 xfs_agnumber_t pag_agno; /* AG this structure belongs to */
196 atomic_t pag_ref; /* perag reference count */
201 char pagf_init; /* this agf's entry is initialized */ 197 char pagf_init; /* this agf's entry is initialized */
202 char pagi_init; /* this agi's entry is initialized */ 198 char pagi_init; /* this agi's entry is initialized */
203 char pagf_metadata; /* the agf is preferred to be metadata */ 199 char pagf_metadata; /* the agf is preferred to be metadata */
@@ -210,8 +206,6 @@ typedef struct xfs_perag
210 __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ 206 __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */
211 xfs_agino_t pagi_freecount; /* number of free inodes */ 207 xfs_agino_t pagi_freecount; /* number of free inodes */
212 xfs_agino_t pagi_count; /* number of allocated inodes */ 208 xfs_agino_t pagi_count; /* number of allocated inodes */
213 int pagb_count; /* pagb slots in use */
214 xfs_perag_busy_t *pagb_list; /* unstable blocks */
215 209
216 /* 210 /*
217 * Inode allocation search lookup optimisation. 211 * Inode allocation search lookup optimisation.
@@ -230,6 +224,8 @@ typedef struct xfs_perag
230 rwlock_t pag_ici_lock; /* incore inode lock */ 224 rwlock_t pag_ici_lock; /* incore inode lock */
231 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 225 struct radix_tree_root pag_ici_root; /* incore inode cache root */
232#endif 226#endif
227 int pagb_count; /* pagb slots in use */
228 xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */
233} xfs_perag_t; 229} xfs_perag_t;
234 230
235/* 231/*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index a1c65fc6d9c4..94cddbfb2560 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -1662,11 +1662,13 @@ xfs_free_ag_extent(
1662 xfs_agf_t *agf; 1662 xfs_agf_t *agf;
1663 xfs_perag_t *pag; /* per allocation group data */ 1663 xfs_perag_t *pag; /* per allocation group data */
1664 1664
1665 pag = xfs_perag_get(mp, agno);
1666 pag->pagf_freeblks += len;
1667 xfs_perag_put(pag);
1668
1665 agf = XFS_BUF_TO_AGF(agbp); 1669 agf = XFS_BUF_TO_AGF(agbp);
1666 pag = &mp->m_perag[agno];
1667 be32_add_cpu(&agf->agf_freeblks, len); 1670 be32_add_cpu(&agf->agf_freeblks, len);
1668 xfs_trans_agblocks_delta(tp, len); 1671 xfs_trans_agblocks_delta(tp, len);
1669 pag->pagf_freeblks += len;
1670 XFS_WANT_CORRUPTED_GOTO( 1672 XFS_WANT_CORRUPTED_GOTO(
1671 be32_to_cpu(agf->agf_freeblks) <= 1673 be32_to_cpu(agf->agf_freeblks) <=
1672 be32_to_cpu(agf->agf_length), 1674 be32_to_cpu(agf->agf_length),
@@ -1969,10 +1971,12 @@ xfs_alloc_get_freelist(
1969 xfs_trans_brelse(tp, agflbp); 1971 xfs_trans_brelse(tp, agflbp);
1970 if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp)) 1972 if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
1971 agf->agf_flfirst = 0; 1973 agf->agf_flfirst = 0;
1972 pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)]; 1974
1975 pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
1973 be32_add_cpu(&agf->agf_flcount, -1); 1976 be32_add_cpu(&agf->agf_flcount, -1);
1974 xfs_trans_agflist_delta(tp, -1); 1977 xfs_trans_agflist_delta(tp, -1);
1975 pag->pagf_flcount--; 1978 pag->pagf_flcount--;
1979 xfs_perag_put(pag);
1976 1980
1977 logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; 1981 logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
1978 if (btreeblk) { 1982 if (btreeblk) {
@@ -2078,7 +2082,8 @@ xfs_alloc_put_freelist(
2078 be32_add_cpu(&agf->agf_fllast, 1); 2082 be32_add_cpu(&agf->agf_fllast, 1);
2079 if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp)) 2083 if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
2080 agf->agf_fllast = 0; 2084 agf->agf_fllast = 0;
2081 pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)]; 2085
2086 pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
2082 be32_add_cpu(&agf->agf_flcount, 1); 2087 be32_add_cpu(&agf->agf_flcount, 1);
2083 xfs_trans_agflist_delta(tp, 1); 2088 xfs_trans_agflist_delta(tp, 1);
2084 pag->pagf_flcount++; 2089 pag->pagf_flcount++;
@@ -2089,6 +2094,7 @@ xfs_alloc_put_freelist(
2089 pag->pagf_btreeblks--; 2094 pag->pagf_btreeblks--;
2090 logflags |= XFS_AGF_BTREEBLKS; 2095 logflags |= XFS_AGF_BTREEBLKS;
2091 } 2096 }
2097 xfs_perag_put(pag);
2092 2098
2093 xfs_alloc_log_agf(tp, agbp, logflags); 2099 xfs_alloc_log_agf(tp, agbp, logflags);
2094 2100
@@ -2152,7 +2158,6 @@ xfs_read_agf(
2152 xfs_trans_brelse(tp, *bpp); 2158 xfs_trans_brelse(tp, *bpp);
2153 return XFS_ERROR(EFSCORRUPTED); 2159 return XFS_ERROR(EFSCORRUPTED);
2154 } 2160 }
2155
2156 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF); 2161 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF);
2157 return 0; 2162 return 0;
2158} 2163}
@@ -2175,7 +2180,7 @@ xfs_alloc_read_agf(
2175 ASSERT(agno != NULLAGNUMBER); 2180 ASSERT(agno != NULLAGNUMBER);
2176 2181
2177 error = xfs_read_agf(mp, tp, agno, 2182 error = xfs_read_agf(mp, tp, agno,
2178 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0, 2183 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
2179 bpp); 2184 bpp);
2180 if (error) 2185 if (error)
2181 return error; 2186 return error;
@@ -2184,7 +2189,7 @@ xfs_alloc_read_agf(
2184 ASSERT(!XFS_BUF_GETERROR(*bpp)); 2189 ASSERT(!XFS_BUF_GETERROR(*bpp));
2185 2190
2186 agf = XFS_BUF_TO_AGF(*bpp); 2191 agf = XFS_BUF_TO_AGF(*bpp);
2187 pag = &mp->m_perag[agno]; 2192 pag = xfs_perag_get(mp, agno);
2188 if (!pag->pagf_init) { 2193 if (!pag->pagf_init) {
2189 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); 2194 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
2190 pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); 2195 pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
@@ -2195,8 +2200,8 @@ xfs_alloc_read_agf(
2195 pag->pagf_levels[XFS_BTNUM_CNTi] = 2200 pag->pagf_levels[XFS_BTNUM_CNTi] =
2196 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); 2201 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
2197 spin_lock_init(&pag->pagb_lock); 2202 spin_lock_init(&pag->pagb_lock);
2198 pag->pagb_list = kmem_zalloc(XFS_PAGB_NUM_SLOTS * 2203 pag->pagb_count = 0;
2199 sizeof(xfs_perag_busy_t), KM_SLEEP); 2204 memset(pag->pagb_list, 0, sizeof(pag->pagb_list));
2200 pag->pagf_init = 1; 2205 pag->pagf_init = 1;
2201 } 2206 }
2202#ifdef DEBUG 2207#ifdef DEBUG
@@ -2211,6 +2216,7 @@ xfs_alloc_read_agf(
2211 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); 2216 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
2212 } 2217 }
2213#endif 2218#endif
2219 xfs_perag_put(pag);
2214 return 0; 2220 return 0;
2215} 2221}
2216 2222
@@ -2270,8 +2276,7 @@ xfs_alloc_vextent(
2270 * These three force us into a single a.g. 2276 * These three force us into a single a.g.
2271 */ 2277 */
2272 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); 2278 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
2273 down_read(&mp->m_peraglock); 2279 args->pag = xfs_perag_get(mp, args->agno);
2274 args->pag = &mp->m_perag[args->agno];
2275 args->minleft = 0; 2280 args->minleft = 0;
2276 error = xfs_alloc_fix_freelist(args, 0); 2281 error = xfs_alloc_fix_freelist(args, 0);
2277 args->minleft = minleft; 2282 args->minleft = minleft;
@@ -2280,14 +2285,12 @@ xfs_alloc_vextent(
2280 goto error0; 2285 goto error0;
2281 } 2286 }
2282 if (!args->agbp) { 2287 if (!args->agbp) {
2283 up_read(&mp->m_peraglock);
2284 trace_xfs_alloc_vextent_noagbp(args); 2288 trace_xfs_alloc_vextent_noagbp(args);
2285 break; 2289 break;
2286 } 2290 }
2287 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); 2291 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
2288 if ((error = xfs_alloc_ag_vextent(args))) 2292 if ((error = xfs_alloc_ag_vextent(args)))
2289 goto error0; 2293 goto error0;
2290 up_read(&mp->m_peraglock);
2291 break; 2294 break;
2292 case XFS_ALLOCTYPE_START_BNO: 2295 case XFS_ALLOCTYPE_START_BNO:
2293 /* 2296 /*
@@ -2339,9 +2342,8 @@ xfs_alloc_vextent(
2339 * Loop over allocation groups twice; first time with 2342 * Loop over allocation groups twice; first time with
2340 * trylock set, second time without. 2343 * trylock set, second time without.
2341 */ 2344 */
2342 down_read(&mp->m_peraglock);
2343 for (;;) { 2345 for (;;) {
2344 args->pag = &mp->m_perag[args->agno]; 2346 args->pag = xfs_perag_get(mp, args->agno);
2345 if (no_min) args->minleft = 0; 2347 if (no_min) args->minleft = 0;
2346 error = xfs_alloc_fix_freelist(args, flags); 2348 error = xfs_alloc_fix_freelist(args, flags);
2347 args->minleft = minleft; 2349 args->minleft = minleft;
@@ -2400,8 +2402,8 @@ xfs_alloc_vextent(
2400 } 2402 }
2401 } 2403 }
2402 } 2404 }
2405 xfs_perag_put(args->pag);
2403 } 2406 }
2404 up_read(&mp->m_peraglock);
2405 if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) { 2407 if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
2406 if (args->agno == sagno) 2408 if (args->agno == sagno)
2407 mp->m_agfrotor = (mp->m_agfrotor + 1) % 2409 mp->m_agfrotor = (mp->m_agfrotor + 1) %
@@ -2427,9 +2429,10 @@ xfs_alloc_vextent(
2427 args->len); 2429 args->len);
2428#endif 2430#endif
2429 } 2431 }
2432 xfs_perag_put(args->pag);
2430 return 0; 2433 return 0;
2431error0: 2434error0:
2432 up_read(&mp->m_peraglock); 2435 xfs_perag_put(args->pag);
2433 return error; 2436 return error;
2434} 2437}
2435 2438
@@ -2454,8 +2457,7 @@ xfs_free_extent(
2454 args.agno = XFS_FSB_TO_AGNO(args.mp, bno); 2457 args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
2455 ASSERT(args.agno < args.mp->m_sb.sb_agcount); 2458 ASSERT(args.agno < args.mp->m_sb.sb_agcount);
2456 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); 2459 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
2457 down_read(&args.mp->m_peraglock); 2460 args.pag = xfs_perag_get(args.mp, args.agno);
2458 args.pag = &args.mp->m_perag[args.agno];
2459 if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING))) 2461 if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING)))
2460 goto error0; 2462 goto error0;
2461#ifdef DEBUG 2463#ifdef DEBUG
@@ -2465,7 +2467,7 @@ xfs_free_extent(
2465#endif 2467#endif
2466 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); 2468 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
2467error0: 2469error0:
2468 up_read(&args.mp->m_peraglock); 2470 xfs_perag_put(args.pag);
2469 return error; 2471 return error;
2470} 2472}
2471 2473
@@ -2486,15 +2488,15 @@ xfs_alloc_mark_busy(xfs_trans_t *tp,
2486 xfs_agblock_t bno, 2488 xfs_agblock_t bno,
2487 xfs_extlen_t len) 2489 xfs_extlen_t len)
2488{ 2490{
2489 xfs_mount_t *mp;
2490 xfs_perag_busy_t *bsy; 2491 xfs_perag_busy_t *bsy;
2492 struct xfs_perag *pag;
2491 int n; 2493 int n;
2492 2494
2493 mp = tp->t_mountp; 2495 pag = xfs_perag_get(tp->t_mountp, agno);
2494 spin_lock(&mp->m_perag[agno].pagb_lock); 2496 spin_lock(&pag->pagb_lock);
2495 2497
2496 /* search pagb_list for an open slot */ 2498 /* search pagb_list for an open slot */
2497 for (bsy = mp->m_perag[agno].pagb_list, n = 0; 2499 for (bsy = pag->pagb_list, n = 0;
2498 n < XFS_PAGB_NUM_SLOTS; 2500 n < XFS_PAGB_NUM_SLOTS;
2499 bsy++, n++) { 2501 bsy++, n++) {
2500 if (bsy->busy_tp == NULL) { 2502 if (bsy->busy_tp == NULL) {
@@ -2502,11 +2504,11 @@ xfs_alloc_mark_busy(xfs_trans_t *tp,
2502 } 2504 }
2503 } 2505 }
2504 2506
2505 trace_xfs_alloc_busy(mp, agno, bno, len, n); 2507 trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n);
2506 2508
2507 if (n < XFS_PAGB_NUM_SLOTS) { 2509 if (n < XFS_PAGB_NUM_SLOTS) {
2508 bsy = &mp->m_perag[agno].pagb_list[n]; 2510 bsy = &pag->pagb_list[n];
2509 mp->m_perag[agno].pagb_count++; 2511 pag->pagb_count++;
2510 bsy->busy_start = bno; 2512 bsy->busy_start = bno;
2511 bsy->busy_length = len; 2513 bsy->busy_length = len;
2512 bsy->busy_tp = tp; 2514 bsy->busy_tp = tp;
@@ -2521,7 +2523,8 @@ xfs_alloc_mark_busy(xfs_trans_t *tp,
2521 xfs_trans_set_sync(tp); 2523 xfs_trans_set_sync(tp);
2522 } 2524 }
2523 2525
2524 spin_unlock(&mp->m_perag[agno].pagb_lock); 2526 spin_unlock(&pag->pagb_lock);
2527 xfs_perag_put(pag);
2525} 2528}
2526 2529
2527void 2530void
@@ -2529,24 +2532,23 @@ xfs_alloc_clear_busy(xfs_trans_t *tp,
2529 xfs_agnumber_t agno, 2532 xfs_agnumber_t agno,
2530 int idx) 2533 int idx)
2531{ 2534{
2532 xfs_mount_t *mp; 2535 struct xfs_perag *pag;
2533 xfs_perag_busy_t *list; 2536 xfs_perag_busy_t *list;
2534 2537
2535 mp = tp->t_mountp;
2536
2537 spin_lock(&mp->m_perag[agno].pagb_lock);
2538 list = mp->m_perag[agno].pagb_list;
2539
2540 ASSERT(idx < XFS_PAGB_NUM_SLOTS); 2538 ASSERT(idx < XFS_PAGB_NUM_SLOTS);
2539 pag = xfs_perag_get(tp->t_mountp, agno);
2540 spin_lock(&pag->pagb_lock);
2541 list = pag->pagb_list;
2541 2542
2542 trace_xfs_alloc_unbusy(mp, agno, idx, list[idx].busy_tp == tp); 2543 trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp);
2543 2544
2544 if (list[idx].busy_tp == tp) { 2545 if (list[idx].busy_tp == tp) {
2545 list[idx].busy_tp = NULL; 2546 list[idx].busy_tp = NULL;
2546 mp->m_perag[agno].pagb_count--; 2547 pag->pagb_count--;
2547 } 2548 }
2548 2549
2549 spin_unlock(&mp->m_perag[agno].pagb_lock); 2550 spin_unlock(&pag->pagb_lock);
2551 xfs_perag_put(pag);
2550} 2552}
2551 2553
2552 2554
@@ -2560,46 +2562,44 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
2560 xfs_agblock_t bno, 2562 xfs_agblock_t bno,
2561 xfs_extlen_t len) 2563 xfs_extlen_t len)
2562{ 2564{
2563 xfs_mount_t *mp; 2565 struct xfs_perag *pag;
2564 xfs_perag_busy_t *bsy; 2566 xfs_perag_busy_t *bsy;
2565 xfs_agblock_t uend, bend; 2567 xfs_agblock_t uend, bend;
2566 xfs_lsn_t lsn; 2568 xfs_lsn_t lsn = 0;
2567 int cnt; 2569 int cnt;
2568 2570
2569 mp = tp->t_mountp; 2571 pag = xfs_perag_get(tp->t_mountp, agno);
2570 2572 spin_lock(&pag->pagb_lock);
2571 spin_lock(&mp->m_perag[agno].pagb_lock); 2573 cnt = pag->pagb_count;
2572 cnt = mp->m_perag[agno].pagb_count;
2573 2574
2575 /*
2576 * search pagb_list for this slot, skipping open slots. We have to
2577 * search the entire array as there may be multiple overlaps and
2578 * we have to get the most recent LSN for the log force to push out
2579 * all the transactions that span the range.
2580 */
2574 uend = bno + len - 1; 2581 uend = bno + len - 1;
2575 2582 for (cnt = 0; cnt < pag->pagb_count; cnt++) {
2576 /* search pagb_list for this slot, skipping open slots */ 2583 bsy = &pag->pagb_list[cnt];
2577 for (bsy = mp->m_perag[agno].pagb_list; cnt; bsy++) { 2584 if (!bsy->busy_tp)
2578 2585 continue;
2579 /* 2586
2580 * (start1,length1) within (start2, length2) 2587 bend = bsy->busy_start + bsy->busy_length - 1;
2581 */ 2588 if (bno > bend || uend < bsy->busy_start)
2582 if (bsy->busy_tp != NULL) { 2589 continue;
2583 bend = bsy->busy_start + bsy->busy_length - 1; 2590
2584 if ((bno > bend) || (uend < bsy->busy_start)) { 2591 /* (start1,length1) within (start2, length2) */
2585 cnt--; 2592 if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0)
2586 } else { 2593 lsn = bsy->busy_tp->t_commit_lsn;
2587 break;
2588 }
2589 }
2590 } 2594 }
2591 2595 spin_unlock(&pag->pagb_lock);
2592 trace_xfs_alloc_busysearch(mp, agno, bno, len, !!cnt); 2596 xfs_perag_put(pag);
2597 trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn);
2593 2598
2594 /* 2599 /*
2595 * If a block was found, force the log through the LSN of the 2600 * If a block was found, force the log through the LSN of the
2596 * transaction that freed the block 2601 * transaction that freed the block
2597 */ 2602 */
2598 if (cnt) { 2603 if (lsn)
2599 lsn = bsy->busy_tp->t_commit_lsn; 2604 xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC);
2600 spin_unlock(&mp->m_perag[agno].pagb_lock);
2601 xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
2602 } else {
2603 spin_unlock(&mp->m_perag[agno].pagb_lock);
2604 }
2605} 2605}
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index adbd9141aea1..b726e10d2c1c 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -61,12 +61,14 @@ xfs_allocbt_set_root(
61 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); 61 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
62 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); 62 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
63 int btnum = cur->bc_btnum; 63 int btnum = cur->bc_btnum;
64 struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno);
64 65
65 ASSERT(ptr->s != 0); 66 ASSERT(ptr->s != 0);
66 67
67 agf->agf_roots[btnum] = ptr->s; 68 agf->agf_roots[btnum] = ptr->s;
68 be32_add_cpu(&agf->agf_levels[btnum], inc); 69 be32_add_cpu(&agf->agf_levels[btnum], inc);
69 cur->bc_mp->m_perag[seqno].pagf_levels[btnum] += inc; 70 pag->pagf_levels[btnum] += inc;
71 xfs_perag_put(pag);
70 72
71 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); 73 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
72} 74}
@@ -150,6 +152,7 @@ xfs_allocbt_update_lastrec(
150{ 152{
151 struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); 153 struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
152 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); 154 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
155 struct xfs_perag *pag;
153 __be32 len; 156 __be32 len;
154 int numrecs; 157 int numrecs;
155 158
@@ -193,7 +196,9 @@ xfs_allocbt_update_lastrec(
193 } 196 }
194 197
195 agf->agf_longest = len; 198 agf->agf_longest = len;
196 cur->bc_mp->m_perag[seqno].pagf_longest = be32_to_cpu(len); 199 pag = xfs_perag_get(cur->bc_mp, seqno);
200 pag->pagf_longest = be32_to_cpu(len);
201 xfs_perag_put(pag);
197 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST); 202 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
198} 203}
199 204
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index e953b6cfb2a8..b9c196a53c42 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -93,12 +93,12 @@ STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args);
93STATIC int 93STATIC int
94xfs_attr_name_to_xname( 94xfs_attr_name_to_xname(
95 struct xfs_name *xname, 95 struct xfs_name *xname,
96 const char *aname) 96 const unsigned char *aname)
97{ 97{
98 if (!aname) 98 if (!aname)
99 return EINVAL; 99 return EINVAL;
100 xname->name = aname; 100 xname->name = aname;
101 xname->len = strlen(aname); 101 xname->len = strlen((char *)aname);
102 if (xname->len >= MAXNAMELEN) 102 if (xname->len >= MAXNAMELEN)
103 return EFAULT; /* match IRIX behaviour */ 103 return EFAULT; /* match IRIX behaviour */
104 104
@@ -124,7 +124,7 @@ STATIC int
124xfs_attr_get_int( 124xfs_attr_get_int(
125 struct xfs_inode *ip, 125 struct xfs_inode *ip,
126 struct xfs_name *name, 126 struct xfs_name *name,
127 char *value, 127 unsigned char *value,
128 int *valuelenp, 128 int *valuelenp,
129 int flags) 129 int flags)
130{ 130{
@@ -171,8 +171,8 @@ xfs_attr_get_int(
171int 171int
172xfs_attr_get( 172xfs_attr_get(
173 xfs_inode_t *ip, 173 xfs_inode_t *ip,
174 const char *name, 174 const unsigned char *name,
175 char *value, 175 unsigned char *value,
176 int *valuelenp, 176 int *valuelenp,
177 int flags) 177 int flags)
178{ 178{
@@ -197,7 +197,7 @@ xfs_attr_get(
197/* 197/*
198 * Calculate how many blocks we need for the new attribute, 198 * Calculate how many blocks we need for the new attribute,
199 */ 199 */
200int 200STATIC int
201xfs_attr_calc_size( 201xfs_attr_calc_size(
202 struct xfs_inode *ip, 202 struct xfs_inode *ip,
203 int namelen, 203 int namelen,
@@ -235,8 +235,12 @@ xfs_attr_calc_size(
235} 235}
236 236
237STATIC int 237STATIC int
238xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name, 238xfs_attr_set_int(
239 char *value, int valuelen, int flags) 239 struct xfs_inode *dp,
240 struct xfs_name *name,
241 unsigned char *value,
242 int valuelen,
243 int flags)
240{ 244{
241 xfs_da_args_t args; 245 xfs_da_args_t args;
242 xfs_fsblock_t firstblock; 246 xfs_fsblock_t firstblock;
@@ -452,8 +456,8 @@ out:
452int 456int
453xfs_attr_set( 457xfs_attr_set(
454 xfs_inode_t *dp, 458 xfs_inode_t *dp,
455 const char *name, 459 const unsigned char *name,
456 char *value, 460 unsigned char *value,
457 int valuelen, 461 int valuelen,
458 int flags) 462 int flags)
459{ 463{
@@ -600,7 +604,7 @@ out:
600int 604int
601xfs_attr_remove( 605xfs_attr_remove(
602 xfs_inode_t *dp, 606 xfs_inode_t *dp,
603 const char *name, 607 const unsigned char *name,
604 int flags) 608 int flags)
605{ 609{
606 int error; 610 int error;
@@ -669,9 +673,13 @@ xfs_attr_list_int(xfs_attr_list_context_t *context)
669 */ 673 */
670/*ARGSUSED*/ 674/*ARGSUSED*/
671STATIC int 675STATIC int
672xfs_attr_put_listent(xfs_attr_list_context_t *context, int flags, 676xfs_attr_put_listent(
673 char *name, int namelen, 677 xfs_attr_list_context_t *context,
674 int valuelen, char *value) 678 int flags,
679 unsigned char *name,
680 int namelen,
681 int valuelen,
682 unsigned char *value)
675{ 683{
676 struct attrlist *alist = (struct attrlist *)context->alist; 684 struct attrlist *alist = (struct attrlist *)context->alist;
677 attrlist_ent_t *aep; 685 attrlist_ent_t *aep;
@@ -1980,7 +1988,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
1980 xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE]; 1988 xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE];
1981 xfs_mount_t *mp; 1989 xfs_mount_t *mp;
1982 xfs_daddr_t dblkno; 1990 xfs_daddr_t dblkno;
1983 xfs_caddr_t dst; 1991 void *dst;
1984 xfs_buf_t *bp; 1992 xfs_buf_t *bp;
1985 int nmap, error, tmp, valuelen, blkcnt, i; 1993 int nmap, error, tmp, valuelen, blkcnt, i;
1986 xfs_dablk_t lblkno; 1994 xfs_dablk_t lblkno;
@@ -2007,15 +2015,14 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
2007 dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); 2015 dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
2008 blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); 2016 blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
2009 error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno, 2017 error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno,
2010 blkcnt, 2018 blkcnt, XBF_LOCK | XBF_DONT_BLOCK,
2011 XFS_BUF_LOCK | XBF_DONT_BLOCK,
2012 &bp); 2019 &bp);
2013 if (error) 2020 if (error)
2014 return(error); 2021 return(error);
2015 2022
2016 tmp = (valuelen < XFS_BUF_SIZE(bp)) 2023 tmp = (valuelen < XFS_BUF_SIZE(bp))
2017 ? valuelen : XFS_BUF_SIZE(bp); 2024 ? valuelen : XFS_BUF_SIZE(bp);
2018 xfs_biomove(bp, 0, tmp, dst, XFS_B_READ); 2025 xfs_biomove(bp, 0, tmp, dst, XBF_READ);
2019 xfs_buf_relse(bp); 2026 xfs_buf_relse(bp);
2020 dst += tmp; 2027 dst += tmp;
2021 valuelen -= tmp; 2028 valuelen -= tmp;
@@ -2039,7 +2046,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2039 xfs_inode_t *dp; 2046 xfs_inode_t *dp;
2040 xfs_bmbt_irec_t map; 2047 xfs_bmbt_irec_t map;
2041 xfs_daddr_t dblkno; 2048 xfs_daddr_t dblkno;
2042 xfs_caddr_t src; 2049 void *src;
2043 xfs_buf_t *bp; 2050 xfs_buf_t *bp;
2044 xfs_dablk_t lblkno; 2051 xfs_dablk_t lblkno;
2045 int blkcnt, valuelen, nmap, error, tmp, committed; 2052 int blkcnt, valuelen, nmap, error, tmp, committed;
@@ -2141,13 +2148,13 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2141 blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); 2148 blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
2142 2149
2143 bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 2150 bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt,
2144 XFS_BUF_LOCK | XBF_DONT_BLOCK); 2151 XBF_LOCK | XBF_DONT_BLOCK);
2145 ASSERT(bp); 2152 ASSERT(bp);
2146 ASSERT(!XFS_BUF_GETERROR(bp)); 2153 ASSERT(!XFS_BUF_GETERROR(bp));
2147 2154
2148 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : 2155 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
2149 XFS_BUF_SIZE(bp); 2156 XFS_BUF_SIZE(bp);
2150 xfs_biomove(bp, 0, tmp, src, XFS_B_WRITE); 2157 xfs_biomove(bp, 0, tmp, src, XBF_WRITE);
2151 if (tmp < XFS_BUF_SIZE(bp)) 2158 if (tmp < XFS_BUF_SIZE(bp))
2152 xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); 2159 xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
2153 if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ 2160 if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
@@ -2208,8 +2215,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
2208 /* 2215 /*
2209 * If the "remote" value is in the cache, remove it. 2216 * If the "remote" value is in the cache, remove it.
2210 */ 2217 */
2211 bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, 2218 bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK);
2212 XFS_INCORE_TRYLOCK);
2213 if (bp) { 2219 if (bp) {
2214 XFS_BUF_STALE(bp); 2220 XFS_BUF_STALE(bp);
2215 XFS_BUF_UNDELAYWRITE(bp); 2221 XFS_BUF_UNDELAYWRITE(bp);
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index 59b410ce69a1..e920d68ef509 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -113,7 +113,7 @@ typedef struct attrlist_cursor_kern {
113 113
114 114
115typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int, 115typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int,
116 char *, int, int, char *); 116 unsigned char *, int, int, unsigned char *);
117 117
118typedef struct xfs_attr_list_context { 118typedef struct xfs_attr_list_context {
119 struct xfs_inode *dp; /* inode */ 119 struct xfs_inode *dp; /* inode */
@@ -139,7 +139,6 @@ typedef struct xfs_attr_list_context {
139/* 139/*
140 * Overall external interface routines. 140 * Overall external interface routines.
141 */ 141 */
142int xfs_attr_calc_size(struct xfs_inode *, int, int, int *);
143int xfs_attr_inactive(struct xfs_inode *dp); 142int xfs_attr_inactive(struct xfs_inode *dp);
144int xfs_attr_rmtval_get(struct xfs_da_args *args); 143int xfs_attr_rmtval_get(struct xfs_da_args *args);
145int xfs_attr_list_int(struct xfs_attr_list_context *); 144int xfs_attr_list_int(struct xfs_attr_list_context *);
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index baf41b5af756..a90ce74fc256 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -521,11 +521,11 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
521 521
522 sfe = &sf->list[0]; 522 sfe = &sf->list[0];
523 for (i = 0; i < sf->hdr.count; i++) { 523 for (i = 0; i < sf->hdr.count; i++) {
524 nargs.name = (char *)sfe->nameval; 524 nargs.name = sfe->nameval;
525 nargs.namelen = sfe->namelen; 525 nargs.namelen = sfe->namelen;
526 nargs.value = (char *)&sfe->nameval[nargs.namelen]; 526 nargs.value = &sfe->nameval[nargs.namelen];
527 nargs.valuelen = sfe->valuelen; 527 nargs.valuelen = sfe->valuelen;
528 nargs.hashval = xfs_da_hashname((char *)sfe->nameval, 528 nargs.hashval = xfs_da_hashname(sfe->nameval,
529 sfe->namelen); 529 sfe->namelen);
530 nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags); 530 nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
531 error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */ 531 error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */
@@ -612,10 +612,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
612 for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { 612 for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
613 error = context->put_listent(context, 613 error = context->put_listent(context,
614 sfe->flags, 614 sfe->flags,
615 (char *)sfe->nameval, 615 sfe->nameval,
616 (int)sfe->namelen, 616 (int)sfe->namelen,
617 (int)sfe->valuelen, 617 (int)sfe->valuelen,
618 (char*)&sfe->nameval[sfe->namelen]); 618 &sfe->nameval[sfe->namelen]);
619 619
620 /* 620 /*
621 * Either search callback finished early or 621 * Either search callback finished early or
@@ -659,8 +659,8 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
659 } 659 }
660 660
661 sbp->entno = i; 661 sbp->entno = i;
662 sbp->hash = xfs_da_hashname((char *)sfe->nameval, sfe->namelen); 662 sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen);
663 sbp->name = (char *)sfe->nameval; 663 sbp->name = sfe->nameval;
664 sbp->namelen = sfe->namelen; 664 sbp->namelen = sfe->namelen;
665 /* These are bytes, and both on-disk, don't endian-flip */ 665 /* These are bytes, and both on-disk, don't endian-flip */
666 sbp->valuelen = sfe->valuelen; 666 sbp->valuelen = sfe->valuelen;
@@ -818,9 +818,9 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
818 continue; 818 continue;
819 ASSERT(entry->flags & XFS_ATTR_LOCAL); 819 ASSERT(entry->flags & XFS_ATTR_LOCAL);
820 name_loc = xfs_attr_leaf_name_local(leaf, i); 820 name_loc = xfs_attr_leaf_name_local(leaf, i);
821 nargs.name = (char *)name_loc->nameval; 821 nargs.name = name_loc->nameval;
822 nargs.namelen = name_loc->namelen; 822 nargs.namelen = name_loc->namelen;
823 nargs.value = (char *)&name_loc->nameval[nargs.namelen]; 823 nargs.value = &name_loc->nameval[nargs.namelen];
824 nargs.valuelen = be16_to_cpu(name_loc->valuelen); 824 nargs.valuelen = be16_to_cpu(name_loc->valuelen);
825 nargs.hashval = be32_to_cpu(entry->hashval); 825 nargs.hashval = be32_to_cpu(entry->hashval);
826 nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags); 826 nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags);
@@ -2370,10 +2370,10 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2370 2370
2371 retval = context->put_listent(context, 2371 retval = context->put_listent(context,
2372 entry->flags, 2372 entry->flags,
2373 (char *)name_loc->nameval, 2373 name_loc->nameval,
2374 (int)name_loc->namelen, 2374 (int)name_loc->namelen,
2375 be16_to_cpu(name_loc->valuelen), 2375 be16_to_cpu(name_loc->valuelen),
2376 (char *)&name_loc->nameval[name_loc->namelen]); 2376 &name_loc->nameval[name_loc->namelen]);
2377 if (retval) 2377 if (retval)
2378 return retval; 2378 return retval;
2379 } else { 2379 } else {
@@ -2397,15 +2397,15 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2397 return retval; 2397 return retval;
2398 retval = context->put_listent(context, 2398 retval = context->put_listent(context,
2399 entry->flags, 2399 entry->flags,
2400 (char *)name_rmt->name, 2400 name_rmt->name,
2401 (int)name_rmt->namelen, 2401 (int)name_rmt->namelen,
2402 valuelen, 2402 valuelen,
2403 (char*)args.value); 2403 args.value);
2404 kmem_free(args.value); 2404 kmem_free(args.value);
2405 } else { 2405 } else {
2406 retval = context->put_listent(context, 2406 retval = context->put_listent(context,
2407 entry->flags, 2407 entry->flags,
2408 (char *)name_rmt->name, 2408 name_rmt->name,
2409 (int)name_rmt->namelen, 2409 (int)name_rmt->namelen,
2410 valuelen, 2410 valuelen,
2411 NULL); 2411 NULL);
@@ -2950,7 +2950,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
2950 map.br_blockcount); 2950 map.br_blockcount);
2951 bp = xfs_trans_get_buf(*trans, 2951 bp = xfs_trans_get_buf(*trans,
2952 dp->i_mount->m_ddev_targp, 2952 dp->i_mount->m_ddev_targp,
2953 dblkno, dblkcnt, XFS_BUF_LOCK); 2953 dblkno, dblkcnt, XBF_LOCK);
2954 xfs_trans_binval(*trans, bp); 2954 xfs_trans_binval(*trans, bp);
2955 /* 2955 /*
2956 * Roll to next transaction. 2956 * Roll to next transaction.
diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h
index 76ab7b0cbb3a..919756e3ba53 100644
--- a/fs/xfs/xfs_attr_sf.h
+++ b/fs/xfs/xfs_attr_sf.h
@@ -52,7 +52,7 @@ typedef struct xfs_attr_sf_sort {
52 __uint8_t valuelen; /* length of value */ 52 __uint8_t valuelen; /* length of value */
53 __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ 53 __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
54 xfs_dahash_t hash; /* this entry's hash value */ 54 xfs_dahash_t hash; /* this entry's hash value */
55 char *name; /* name value, pointer into buffer */ 55 unsigned char *name; /* name value, pointer into buffer */
56} xfs_attr_sf_sort_t; 56} xfs_attr_sf_sort_t;
57 57
58#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \ 58#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 98251cdc52aa..5c11e4d17010 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2550,22 +2550,134 @@ xfs_bmap_rtalloc(
2550} 2550}
2551 2551
2552STATIC int 2552STATIC int
2553xfs_bmap_btalloc_nullfb(
2554 struct xfs_bmalloca *ap,
2555 struct xfs_alloc_arg *args,
2556 xfs_extlen_t *blen)
2557{
2558 struct xfs_mount *mp = ap->ip->i_mount;
2559 struct xfs_perag *pag;
2560 xfs_agnumber_t ag, startag;
2561 int notinit = 0;
2562 int error;
2563
2564 if (ap->userdata && xfs_inode_is_filestream(ap->ip))
2565 args->type = XFS_ALLOCTYPE_NEAR_BNO;
2566 else
2567 args->type = XFS_ALLOCTYPE_START_BNO;
2568 args->total = ap->total;
2569
2570 /*
2571 * Search for an allocation group with a single extent large enough
2572 * for the request. If one isn't found, then adjust the minimum
2573 * allocation size to the largest space found.
2574 */
2575 startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
2576 if (startag == NULLAGNUMBER)
2577 startag = ag = 0;
2578
2579 pag = xfs_perag_get(mp, ag);
2580 while (*blen < ap->alen) {
2581 if (!pag->pagf_init) {
2582 error = xfs_alloc_pagf_init(mp, args->tp, ag,
2583 XFS_ALLOC_FLAG_TRYLOCK);
2584 if (error) {
2585 xfs_perag_put(pag);
2586 return error;
2587 }
2588 }
2589
2590 /*
2591 * See xfs_alloc_fix_freelist...
2592 */
2593 if (pag->pagf_init) {
2594 xfs_extlen_t longest;
2595 longest = xfs_alloc_longest_free_extent(mp, pag);
2596 if (*blen < longest)
2597 *blen = longest;
2598 } else
2599 notinit = 1;
2600
2601 if (xfs_inode_is_filestream(ap->ip)) {
2602 if (*blen >= ap->alen)
2603 break;
2604
2605 if (ap->userdata) {
2606 /*
2607 * If startag is an invalid AG, we've
2608 * come here once before and
2609 * xfs_filestream_new_ag picked the
2610 * best currently available.
2611 *
2612 * Don't continue looping, since we
2613 * could loop forever.
2614 */
2615 if (startag == NULLAGNUMBER)
2616 break;
2617
2618 error = xfs_filestream_new_ag(ap, &ag);
2619 xfs_perag_put(pag);
2620 if (error)
2621 return error;
2622
2623 /* loop again to set 'blen'*/
2624 startag = NULLAGNUMBER;
2625 pag = xfs_perag_get(mp, ag);
2626 continue;
2627 }
2628 }
2629 if (++ag == mp->m_sb.sb_agcount)
2630 ag = 0;
2631 if (ag == startag)
2632 break;
2633 xfs_perag_put(pag);
2634 pag = xfs_perag_get(mp, ag);
2635 }
2636 xfs_perag_put(pag);
2637
2638 /*
2639 * Since the above loop did a BUF_TRYLOCK, it is
2640 * possible that there is space for this request.
2641 */
2642 if (notinit || *blen < ap->minlen)
2643 args->minlen = ap->minlen;
2644 /*
2645 * If the best seen length is less than the request
2646 * length, use the best as the minimum.
2647 */
2648 else if (*blen < ap->alen)
2649 args->minlen = *blen;
2650 /*
2651 * Otherwise we've seen an extent as big as alen,
2652 * use that as the minimum.
2653 */
2654 else
2655 args->minlen = ap->alen;
2656
2657 /*
2658 * set the failure fallback case to look in the selected
2659 * AG as the stream may have moved.
2660 */
2661 if (xfs_inode_is_filestream(ap->ip))
2662 ap->rval = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
2663
2664 return 0;
2665}
2666
2667STATIC int
2553xfs_bmap_btalloc( 2668xfs_bmap_btalloc(
2554 xfs_bmalloca_t *ap) /* bmap alloc argument struct */ 2669 xfs_bmalloca_t *ap) /* bmap alloc argument struct */
2555{ 2670{
2556 xfs_mount_t *mp; /* mount point structure */ 2671 xfs_mount_t *mp; /* mount point structure */
2557 xfs_alloctype_t atype = 0; /* type for allocation routines */ 2672 xfs_alloctype_t atype = 0; /* type for allocation routines */
2558 xfs_extlen_t align; /* minimum allocation alignment */ 2673 xfs_extlen_t align; /* minimum allocation alignment */
2559 xfs_agnumber_t ag;
2560 xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ 2674 xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
2561 xfs_agnumber_t startag; 2675 xfs_agnumber_t ag;
2562 xfs_alloc_arg_t args; 2676 xfs_alloc_arg_t args;
2563 xfs_extlen_t blen; 2677 xfs_extlen_t blen;
2564 xfs_extlen_t nextminlen = 0; 2678 xfs_extlen_t nextminlen = 0;
2565 xfs_perag_t *pag;
2566 int nullfb; /* true if ap->firstblock isn't set */ 2679 int nullfb; /* true if ap->firstblock isn't set */
2567 int isaligned; 2680 int isaligned;
2568 int notinit;
2569 int tryagain; 2681 int tryagain;
2570 int error; 2682 int error;
2571 2683
@@ -2612,102 +2724,9 @@ xfs_bmap_btalloc(
2612 args.firstblock = ap->firstblock; 2724 args.firstblock = ap->firstblock;
2613 blen = 0; 2725 blen = 0;
2614 if (nullfb) { 2726 if (nullfb) {
2615 if (ap->userdata && xfs_inode_is_filestream(ap->ip)) 2727 error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
2616 args.type = XFS_ALLOCTYPE_NEAR_BNO; 2728 if (error)
2617 else 2729 return error;
2618 args.type = XFS_ALLOCTYPE_START_BNO;
2619 args.total = ap->total;
2620
2621 /*
2622 * Search for an allocation group with a single extent
2623 * large enough for the request.
2624 *
2625 * If one isn't found, then adjust the minimum allocation
2626 * size to the largest space found.
2627 */
2628 startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno);
2629 if (startag == NULLAGNUMBER)
2630 startag = ag = 0;
2631 notinit = 0;
2632 down_read(&mp->m_peraglock);
2633 while (blen < ap->alen) {
2634 pag = &mp->m_perag[ag];
2635 if (!pag->pagf_init &&
2636 (error = xfs_alloc_pagf_init(mp, args.tp,
2637 ag, XFS_ALLOC_FLAG_TRYLOCK))) {
2638 up_read(&mp->m_peraglock);
2639 return error;
2640 }
2641 /*
2642 * See xfs_alloc_fix_freelist...
2643 */
2644 if (pag->pagf_init) {
2645 xfs_extlen_t longest;
2646 longest = xfs_alloc_longest_free_extent(mp, pag);
2647 if (blen < longest)
2648 blen = longest;
2649 } else
2650 notinit = 1;
2651
2652 if (xfs_inode_is_filestream(ap->ip)) {
2653 if (blen >= ap->alen)
2654 break;
2655
2656 if (ap->userdata) {
2657 /*
2658 * If startag is an invalid AG, we've
2659 * come here once before and
2660 * xfs_filestream_new_ag picked the
2661 * best currently available.
2662 *
2663 * Don't continue looping, since we
2664 * could loop forever.
2665 */
2666 if (startag == NULLAGNUMBER)
2667 break;
2668
2669 error = xfs_filestream_new_ag(ap, &ag);
2670 if (error) {
2671 up_read(&mp->m_peraglock);
2672 return error;
2673 }
2674
2675 /* loop again to set 'blen'*/
2676 startag = NULLAGNUMBER;
2677 continue;
2678 }
2679 }
2680 if (++ag == mp->m_sb.sb_agcount)
2681 ag = 0;
2682 if (ag == startag)
2683 break;
2684 }
2685 up_read(&mp->m_peraglock);
2686 /*
2687 * Since the above loop did a BUF_TRYLOCK, it is
2688 * possible that there is space for this request.
2689 */
2690 if (notinit || blen < ap->minlen)
2691 args.minlen = ap->minlen;
2692 /*
2693 * If the best seen length is less than the request
2694 * length, use the best as the minimum.
2695 */
2696 else if (blen < ap->alen)
2697 args.minlen = blen;
2698 /*
2699 * Otherwise we've seen an extent as big as alen,
2700 * use that as the minimum.
2701 */
2702 else
2703 args.minlen = ap->alen;
2704
2705 /*
2706 * set the failure fallback case to look in the selected
2707 * AG as the stream may have moved.
2708 */
2709 if (xfs_inode_is_filestream(ap->ip))
2710 ap->rval = args.fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
2711 } else if (ap->low) { 2730 } else if (ap->low) {
2712 if (xfs_inode_is_filestream(ap->ip)) 2731 if (xfs_inode_is_filestream(ap->ip))
2713 args.type = XFS_ALLOCTYPE_FIRST_AG; 2732 args.type = XFS_ALLOCTYPE_FIRST_AG;
@@ -4470,7 +4489,7 @@ xfs_bmapi(
4470 xfs_fsblock_t abno; /* allocated block number */ 4489 xfs_fsblock_t abno; /* allocated block number */
4471 xfs_extlen_t alen; /* allocated extent length */ 4490 xfs_extlen_t alen; /* allocated extent length */
4472 xfs_fileoff_t aoff; /* allocated file offset */ 4491 xfs_fileoff_t aoff; /* allocated file offset */
4473 xfs_bmalloca_t bma; /* args for xfs_bmap_alloc */ 4492 xfs_bmalloca_t bma = { 0 }; /* args for xfs_bmap_alloc */
4474 xfs_btree_cur_t *cur; /* bmap btree cursor */ 4493 xfs_btree_cur_t *cur; /* bmap btree cursor */
4475 xfs_fileoff_t end; /* end of mapped file region */ 4494 xfs_fileoff_t end; /* end of mapped file region */
4476 int eof; /* we've hit the end of extents */ 4495 int eof; /* we've hit the end of extents */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 38751d5fac6f..416e47e54b83 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -334,7 +334,7 @@ xfs_bmbt_disk_set_allf(
334/* 334/*
335 * Set all the fields in a bmap extent record from the uncompressed form. 335 * Set all the fields in a bmap extent record from the uncompressed form.
336 */ 336 */
337void 337STATIC void
338xfs_bmbt_disk_set_all( 338xfs_bmbt_disk_set_all(
339 xfs_bmbt_rec_t *r, 339 xfs_bmbt_rec_t *r,
340 xfs_bmbt_irec_t *s) 340 xfs_bmbt_irec_t *s)
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index cf07ca7c22e7..0e66c4ea0f85 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -223,7 +223,6 @@ extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v);
223extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v); 223extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v);
224extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v); 224extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v);
225 225
226extern void xfs_bmbt_disk_set_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
227extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o, 226extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
228 xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); 227 xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
229 228
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 36a0992dd669..96be4b0f2496 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -977,7 +977,7 @@ xfs_btree_get_buf_block(
977 xfs_daddr_t d; 977 xfs_daddr_t d;
978 978
979 /* need to sort out how callers deal with failures first */ 979 /* need to sort out how callers deal with failures first */
980 ASSERT(!(flags & XFS_BUF_TRYLOCK)); 980 ASSERT(!(flags & XBF_TRYLOCK));
981 981
982 d = xfs_btree_ptr_to_daddr(cur, ptr); 982 d = xfs_btree_ptr_to_daddr(cur, ptr);
983 *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, 983 *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
@@ -1008,7 +1008,7 @@ xfs_btree_read_buf_block(
1008 int error; 1008 int error;
1009 1009
1010 /* need to sort out how callers deal with failures first */ 1010 /* need to sort out how callers deal with failures first */
1011 ASSERT(!(flags & XFS_BUF_TRYLOCK)); 1011 ASSERT(!(flags & XBF_TRYLOCK));
1012 1012
1013 d = xfs_btree_ptr_to_daddr(cur, ptr); 1013 d = xfs_btree_ptr_to_daddr(cur, ptr);
1014 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, 1014 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index a30f7e9eb2b9..f3c49e69eab9 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -250,7 +250,7 @@ xfs_buf_item_format(
250 ((bip->bli_format.blf_map_size - 1) * sizeof(uint))); 250 ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
251 vecp->i_addr = (xfs_caddr_t)&bip->bli_format; 251 vecp->i_addr = (xfs_caddr_t)&bip->bli_format;
252 vecp->i_len = base_size; 252 vecp->i_len = base_size;
253 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BFORMAT); 253 vecp->i_type = XLOG_REG_TYPE_BFORMAT;
254 vecp++; 254 vecp++;
255 nvecs = 1; 255 nvecs = 1;
256 256
@@ -297,14 +297,14 @@ xfs_buf_item_format(
297 buffer_offset = first_bit * XFS_BLI_CHUNK; 297 buffer_offset = first_bit * XFS_BLI_CHUNK;
298 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 298 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
299 vecp->i_len = nbits * XFS_BLI_CHUNK; 299 vecp->i_len = nbits * XFS_BLI_CHUNK;
300 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); 300 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
301 nvecs++; 301 nvecs++;
302 break; 302 break;
303 } else if (next_bit != last_bit + 1) { 303 } else if (next_bit != last_bit + 1) {
304 buffer_offset = first_bit * XFS_BLI_CHUNK; 304 buffer_offset = first_bit * XFS_BLI_CHUNK;
305 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 305 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
306 vecp->i_len = nbits * XFS_BLI_CHUNK; 306 vecp->i_len = nbits * XFS_BLI_CHUNK;
307 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); 307 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
308 nvecs++; 308 nvecs++;
309 vecp++; 309 vecp++;
310 first_bit = next_bit; 310 first_bit = next_bit;
@@ -316,7 +316,7 @@ xfs_buf_item_format(
316 buffer_offset = first_bit * XFS_BLI_CHUNK; 316 buffer_offset = first_bit * XFS_BLI_CHUNK;
317 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 317 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
318 vecp->i_len = nbits * XFS_BLI_CHUNK; 318 vecp->i_len = nbits * XFS_BLI_CHUNK;
319 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); 319 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
320/* You would think we need to bump the nvecs here too, but we do not 320/* You would think we need to bump the nvecs here too, but we do not
321 * this number is used by recovery, and it gets confused by the boundary 321 * this number is used by recovery, and it gets confused by the boundary
322 * split here 322 * split here
@@ -467,8 +467,10 @@ xfs_buf_item_unpin_remove(
467/* 467/*
468 * This is called to attempt to lock the buffer associated with this 468 * This is called to attempt to lock the buffer associated with this
469 * buf log item. Don't sleep on the buffer lock. If we can't get 469 * buf log item. Don't sleep on the buffer lock. If we can't get
470 * the lock right away, return 0. If we can get the lock, pull the 470 * the lock right away, return 0. If we can get the lock, take a
471 * buffer from the free list, mark it busy, and return 1. 471 * reference to the buffer. If this is a delayed write buffer that
472 * needs AIL help to be written back, invoke the pushbuf routine
473 * rather than the normal success path.
472 */ 474 */
473STATIC uint 475STATIC uint
474xfs_buf_item_trylock( 476xfs_buf_item_trylock(
@@ -477,24 +479,18 @@ xfs_buf_item_trylock(
477 xfs_buf_t *bp; 479 xfs_buf_t *bp;
478 480
479 bp = bip->bli_buf; 481 bp = bip->bli_buf;
480 482 if (XFS_BUF_ISPINNED(bp))
481 if (XFS_BUF_ISPINNED(bp)) {
482 return XFS_ITEM_PINNED; 483 return XFS_ITEM_PINNED;
483 } 484 if (!XFS_BUF_CPSEMA(bp))
484
485 if (!XFS_BUF_CPSEMA(bp)) {
486 return XFS_ITEM_LOCKED; 485 return XFS_ITEM_LOCKED;
487 }
488 486
489 /* 487 /* take a reference to the buffer. */
490 * Remove the buffer from the free list. Only do this
491 * if it's on the free list. Private buffers like the
492 * superblock buffer are not.
493 */
494 XFS_BUF_HOLD(bp); 488 XFS_BUF_HOLD(bp);
495 489
496 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 490 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
497 trace_xfs_buf_item_trylock(bip); 491 trace_xfs_buf_item_trylock(bip);
492 if (XFS_BUF_ISDELAYWRITE(bp))
493 return XFS_ITEM_PUSHBUF;
498 return XFS_ITEM_SUCCESS; 494 return XFS_ITEM_SUCCESS;
499} 495}
500 496
@@ -626,11 +622,9 @@ xfs_buf_item_committed(
626} 622}
627 623
628/* 624/*
629 * This is called to asynchronously write the buffer associated with this 625 * The buffer is locked, but is not a delayed write buffer. This happens
630 * buf log item out to disk. The buffer will already have been locked by 626 * if we race with IO completion and hence we don't want to try to write it
631 * a successful call to xfs_buf_item_trylock(). If the buffer still has 627 * again. Just release the buffer.
632 * B_DELWRI set, then get it going out to disk with a call to bawrite().
633 * If not, then just release the buffer.
634 */ 628 */
635STATIC void 629STATIC void
636xfs_buf_item_push( 630xfs_buf_item_push(
@@ -642,17 +636,29 @@ xfs_buf_item_push(
642 trace_xfs_buf_item_push(bip); 636 trace_xfs_buf_item_push(bip);
643 637
644 bp = bip->bli_buf; 638 bp = bip->bli_buf;
639 ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
640 xfs_buf_relse(bp);
641}
645 642
646 if (XFS_BUF_ISDELAYWRITE(bp)) { 643/*
647 int error; 644 * The buffer is locked and is a delayed write buffer. Promote the buffer
648 error = xfs_bawrite(bip->bli_item.li_mountp, bp); 645 * in the delayed write queue as the caller knows that they must invoke
649 if (error) 646 * the xfsbufd to get this buffer written. We have to unlock the buffer
650 xfs_fs_cmn_err(CE_WARN, bip->bli_item.li_mountp, 647 * to allow the xfsbufd to write it, too.
651 "xfs_buf_item_push: pushbuf error %d on bip %p, bp %p", 648 */
652 error, bip, bp); 649STATIC void
653 } else { 650xfs_buf_item_pushbuf(
654 xfs_buf_relse(bp); 651 xfs_buf_log_item_t *bip)
655 } 652{
653 xfs_buf_t *bp;
654
655 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
656 trace_xfs_buf_item_pushbuf(bip);
657
658 bp = bip->bli_buf;
659 ASSERT(XFS_BUF_ISDELAYWRITE(bp));
660 xfs_buf_delwri_promote(bp);
661 xfs_buf_relse(bp);
656} 662}
657 663
658/* ARGSUSED */ 664/* ARGSUSED */
@@ -677,7 +683,7 @@ static struct xfs_item_ops xfs_buf_item_ops = {
677 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) 683 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
678 xfs_buf_item_committed, 684 xfs_buf_item_committed,
679 .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push, 685 .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push,
680 .iop_pushbuf = NULL, 686 .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_buf_item_pushbuf,
681 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) 687 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
682 xfs_buf_item_committing 688 xfs_buf_item_committing
683}; 689};
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index c0c8869115b1..0ca556b4bf31 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1534,8 +1534,8 @@ xfs_da_hashname(const __uint8_t *name, int namelen)
1534enum xfs_dacmp 1534enum xfs_dacmp
1535xfs_da_compname( 1535xfs_da_compname(
1536 struct xfs_da_args *args, 1536 struct xfs_da_args *args,
1537 const char *name, 1537 const unsigned char *name,
1538 int len) 1538 int len)
1539{ 1539{
1540 return (args->namelen == len && memcmp(args->name, name, len) == 0) ? 1540 return (args->namelen == len && memcmp(args->name, name, len) == 0) ?
1541 XFS_CMP_EXACT : XFS_CMP_DIFFERENT; 1541 XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 30cd08f56a3a..fe9f5a8c1d2a 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -209,7 +209,8 @@ typedef struct xfs_da_state {
209 */ 209 */
210struct xfs_nameops { 210struct xfs_nameops {
211 xfs_dahash_t (*hashname)(struct xfs_name *); 211 xfs_dahash_t (*hashname)(struct xfs_name *);
212 enum xfs_dacmp (*compname)(struct xfs_da_args *, const char *, int); 212 enum xfs_dacmp (*compname)(struct xfs_da_args *,
213 const unsigned char *, int);
213}; 214};
214 215
215 216
@@ -260,7 +261,7 @@ int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
260 261
261uint xfs_da_hashname(const __uint8_t *name_string, int name_length); 262uint xfs_da_hashname(const __uint8_t *name_string, int name_length);
262enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, 263enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
263 const char *name, int len); 264 const unsigned char *name, int len);
264 265
265 266
266xfs_da_state_t *xfs_da_state_alloc(void); 267xfs_da_state_t *xfs_da_state_alloc(void);
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index d1483a4f71b8..5bba29a07812 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -45,15 +45,21 @@
45#include "xfs_vnodeops.h" 45#include "xfs_vnodeops.h"
46#include "xfs_trace.h" 46#include "xfs_trace.h"
47 47
48
49static int xfs_swap_extents(
50 xfs_inode_t *ip, /* target inode */
51 xfs_inode_t *tip, /* tmp inode */
52 xfs_swapext_t *sxp);
53
48/* 54/*
49 * Syssgi interface for swapext 55 * ioctl interface for swapext
50 */ 56 */
51int 57int
52xfs_swapext( 58xfs_swapext(
53 xfs_swapext_t *sxp) 59 xfs_swapext_t *sxp)
54{ 60{
55 xfs_inode_t *ip, *tip; 61 xfs_inode_t *ip, *tip;
56 struct file *file, *target_file; 62 struct file *file, *tmp_file;
57 int error = 0; 63 int error = 0;
58 64
59 /* Pull information for the target fd */ 65 /* Pull information for the target fd */
@@ -68,56 +74,138 @@ xfs_swapext(
68 goto out_put_file; 74 goto out_put_file;
69 } 75 }
70 76
71 target_file = fget((int)sxp->sx_fdtmp); 77 tmp_file = fget((int)sxp->sx_fdtmp);
72 if (!target_file) { 78 if (!tmp_file) {
73 error = XFS_ERROR(EINVAL); 79 error = XFS_ERROR(EINVAL);
74 goto out_put_file; 80 goto out_put_file;
75 } 81 }
76 82
77 if (!(target_file->f_mode & FMODE_WRITE) || 83 if (!(tmp_file->f_mode & FMODE_WRITE) ||
78 (target_file->f_flags & O_APPEND)) { 84 (tmp_file->f_flags & O_APPEND)) {
79 error = XFS_ERROR(EBADF); 85 error = XFS_ERROR(EBADF);
80 goto out_put_target_file; 86 goto out_put_tmp_file;
81 } 87 }
82 88
83 if (IS_SWAPFILE(file->f_path.dentry->d_inode) || 89 if (IS_SWAPFILE(file->f_path.dentry->d_inode) ||
84 IS_SWAPFILE(target_file->f_path.dentry->d_inode)) { 90 IS_SWAPFILE(tmp_file->f_path.dentry->d_inode)) {
85 error = XFS_ERROR(EINVAL); 91 error = XFS_ERROR(EINVAL);
86 goto out_put_target_file; 92 goto out_put_tmp_file;
87 } 93 }
88 94
89 ip = XFS_I(file->f_path.dentry->d_inode); 95 ip = XFS_I(file->f_path.dentry->d_inode);
90 tip = XFS_I(target_file->f_path.dentry->d_inode); 96 tip = XFS_I(tmp_file->f_path.dentry->d_inode);
91 97
92 if (ip->i_mount != tip->i_mount) { 98 if (ip->i_mount != tip->i_mount) {
93 error = XFS_ERROR(EINVAL); 99 error = XFS_ERROR(EINVAL);
94 goto out_put_target_file; 100 goto out_put_tmp_file;
95 } 101 }
96 102
97 if (ip->i_ino == tip->i_ino) { 103 if (ip->i_ino == tip->i_ino) {
98 error = XFS_ERROR(EINVAL); 104 error = XFS_ERROR(EINVAL);
99 goto out_put_target_file; 105 goto out_put_tmp_file;
100 } 106 }
101 107
102 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 108 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
103 error = XFS_ERROR(EIO); 109 error = XFS_ERROR(EIO);
104 goto out_put_target_file; 110 goto out_put_tmp_file;
105 } 111 }
106 112
107 error = xfs_swap_extents(ip, tip, sxp); 113 error = xfs_swap_extents(ip, tip, sxp);
108 114
109 out_put_target_file: 115 out_put_tmp_file:
110 fput(target_file); 116 fput(tmp_file);
111 out_put_file: 117 out_put_file:
112 fput(file); 118 fput(file);
113 out: 119 out:
114 return error; 120 return error;
115} 121}
116 122
117int 123/*
124 * We need to check that the format of the data fork in the temporary inode is
125 * valid for the target inode before doing the swap. This is not a problem with
126 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
127 * data fork depending on the space the attribute fork is taking so we can get
128 * invalid formats on the target inode.
129 *
130 * E.g. target has space for 7 extents in extent format, temp inode only has
131 * space for 6. If we defragment down to 7 extents, then the tmp format is a
132 * btree, but when swapped it needs to be in extent format. Hence we can't just
133 * blindly swap data forks on attr2 filesystems.
134 *
135 * Note that we check the swap in both directions so that we don't end up with
136 * a corrupt temporary inode, either.
137 *
138 * Note that fixing the way xfs_fsr sets up the attribute fork in the source
139 * inode will prevent this situation from occurring, so all we do here is
140 * reject and log the attempt. Basically, we are putting the responsibility on
141 * userspace to get this right.
142 */
143static int
144xfs_swap_extents_check_format(
145 xfs_inode_t *ip, /* target inode */
146 xfs_inode_t *tip) /* tmp inode */
147{
148
149 /* Should never get a local format */
150 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
151 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
152 return EINVAL;
153
154 /*
155 * if the target inode has fewer extents than the temporary inode then
156 * why did userspace call us?
157 */
158 if (ip->i_d.di_nextents < tip->i_d.di_nextents)
159 return EINVAL;
160
161 /*
162 * if the target inode is in extent form and the temp inode is in btree
163 * form then we will end up with the target inode in the wrong format
164 * as we already know there are less extents in the temp inode.
165 */
166 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
167 tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
168 return EINVAL;
169
170 /* Check temp in extent form to max in target */
171 if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
172 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max)
173 return EINVAL;
174
175 /* Check target in extent form to max in temp */
176 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
177 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
178 return EINVAL;
179
180 /*
181 * If we are in a btree format, check that the temp root block will fit
182 * in the target and that it has enough extents to be in btree format
183 * in the target.
184 *
185 * Note that we have to be careful to allow btree->extent conversions
186 * (a common defrag case) which will occur when the temp inode is in
187 * extent format...
188 */
189 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
190 ((XFS_IFORK_BOFF(ip) &&
191 tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) ||
192 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max))
193 return EINVAL;
194
195 /* Reciprocal target->temp btree format checks */
196 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
197 ((XFS_IFORK_BOFF(tip) &&
198 ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) ||
199 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max))
200 return EINVAL;
201
202 return 0;
203}
204
205static int
118xfs_swap_extents( 206xfs_swap_extents(
119 xfs_inode_t *ip, 207 xfs_inode_t *ip, /* target inode */
120 xfs_inode_t *tip, 208 xfs_inode_t *tip, /* tmp inode */
121 xfs_swapext_t *sxp) 209 xfs_swapext_t *sxp)
122{ 210{
123 xfs_mount_t *mp; 211 xfs_mount_t *mp;
@@ -161,13 +249,6 @@ xfs_swap_extents(
161 goto out_unlock; 249 goto out_unlock;
162 } 250 }
163 251
164 /* Should never get a local format */
165 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
166 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
167 error = XFS_ERROR(EINVAL);
168 goto out_unlock;
169 }
170
171 if (VN_CACHED(VFS_I(tip)) != 0) { 252 if (VN_CACHED(VFS_I(tip)) != 0) {
172 error = xfs_flushinval_pages(tip, 0, -1, 253 error = xfs_flushinval_pages(tip, 0, -1,
173 FI_REMAPF_LOCKED); 254 FI_REMAPF_LOCKED);
@@ -189,13 +270,15 @@ xfs_swap_extents(
189 goto out_unlock; 270 goto out_unlock;
190 } 271 }
191 272
192 /* 273 trace_xfs_swap_extent_before(ip, 0);
193 * If the target has extended attributes, the tmp file 274 trace_xfs_swap_extent_before(tip, 1);
194 * must also in order to ensure the correct data fork 275
195 * format. 276 /* check inode formats now that data is flushed */
196 */ 277 error = xfs_swap_extents_check_format(ip, tip);
197 if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) { 278 if (error) {
198 error = XFS_ERROR(EINVAL); 279 xfs_fs_cmn_err(CE_NOTE, mp,
280 "%s: inode 0x%llx format is incompatible for exchanging.",
281 __FILE__, ip->i_ino);
199 goto out_unlock; 282 goto out_unlock;
200 } 283 }
201 284
@@ -276,6 +359,16 @@ xfs_swap_extents(
276 *tifp = *tempifp; /* struct copy */ 359 *tifp = *tempifp; /* struct copy */
277 360
278 /* 361 /*
362 * Fix the in-memory data fork values that are dependent on the fork
363 * offset in the inode. We can't assume they remain the same as attr2
364 * has dynamic fork offsets.
365 */
366 ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
367 (uint)sizeof(xfs_bmbt_rec_t);
368 tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
369 (uint)sizeof(xfs_bmbt_rec_t);
370
371 /*
279 * Fix the on-disk inode values 372 * Fix the on-disk inode values
280 */ 373 */
281 tmp = (__uint64_t)ip->i_d.di_nblocks; 374 tmp = (__uint64_t)ip->i_d.di_nblocks;
@@ -347,6 +440,8 @@ xfs_swap_extents(
347 440
348 error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); 441 error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT);
349 442
443 trace_xfs_swap_extent_after(ip, 0);
444 trace_xfs_swap_extent_after(tip, 1);
350out: 445out:
351 kmem_free(tempifp); 446 kmem_free(tempifp);
352 return error; 447 return error;
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
index 4f55a6306558..20bdd935c121 100644
--- a/fs/xfs/xfs_dfrag.h
+++ b/fs/xfs/xfs_dfrag.h
@@ -48,9 +48,6 @@ typedef struct xfs_swapext
48 */ 48 */
49int xfs_swapext(struct xfs_swapext *sx); 49int xfs_swapext(struct xfs_swapext *sx);
50 50
51int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
52 struct xfs_swapext *sxp);
53
54#endif /* __KERNEL__ */ 51#endif /* __KERNEL__ */
55 52
56#endif /* __XFS_DFRAG_H__ */ 53#endif /* __XFS_DFRAG_H__ */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 93634a7e90e9..42520f041265 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -44,7 +44,7 @@
44#include "xfs_vnodeops.h" 44#include "xfs_vnodeops.h"
45#include "xfs_trace.h" 45#include "xfs_trace.h"
46 46
47struct xfs_name xfs_name_dotdot = {"..", 2}; 47struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2};
48 48
49/* 49/*
50 * ASCII case-insensitive (ie. A-Z) support for directories that was 50 * ASCII case-insensitive (ie. A-Z) support for directories that was
@@ -66,8 +66,8 @@ xfs_ascii_ci_hashname(
66STATIC enum xfs_dacmp 66STATIC enum xfs_dacmp
67xfs_ascii_ci_compname( 67xfs_ascii_ci_compname(
68 struct xfs_da_args *args, 68 struct xfs_da_args *args,
69 const char *name, 69 const unsigned char *name,
70 int len) 70 int len)
71{ 71{
72 enum xfs_dacmp result; 72 enum xfs_dacmp result;
73 int i; 73 int i;
@@ -247,7 +247,7 @@ xfs_dir_createname(
247int 247int
248xfs_dir_cilookup_result( 248xfs_dir_cilookup_result(
249 struct xfs_da_args *args, 249 struct xfs_da_args *args,
250 const char *name, 250 const unsigned char *name,
251 int len) 251 int len)
252{ 252{
253 if (args->cmpresult == XFS_CMP_DIFFERENT) 253 if (args->cmpresult == XFS_CMP_DIFFERENT)
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index 1d9ef96f33aa..74a3b1057685 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -100,7 +100,7 @@ extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp,
100extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, 100extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
101 struct xfs_dabuf *bp); 101 struct xfs_dabuf *bp);
102 102
103extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const char *name, 103extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
104 int len); 104 const unsigned char *name, int len);
105 105
106#endif /* __XFS_DIR2_H__ */ 106#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index ddc4ecc7807f..779a267b0a84 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -57,8 +57,8 @@ static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
57void 57void
58xfs_dir_startup(void) 58xfs_dir_startup(void)
59{ 59{
60 xfs_dir_hash_dot = xfs_da_hashname(".", 1); 60 xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1);
61 xfs_dir_hash_dotdot = xfs_da_hashname("..", 2); 61 xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
62} 62}
63 63
64/* 64/*
@@ -513,8 +513,9 @@ xfs_dir2_block_getdents(
513 /* 513 /*
514 * If it didn't fit, set the final offset to here & return. 514 * If it didn't fit, set the final offset to here & return.
515 */ 515 */
516 if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff, 516 if (filldir(dirent, (char *)dep->name, dep->namelen,
517 be64_to_cpu(dep->inumber), DT_UNKNOWN)) { 517 cook & 0x7fffffff, be64_to_cpu(dep->inumber),
518 DT_UNKNOWN)) {
518 *offset = cook & 0x7fffffff; 519 *offset = cook & 0x7fffffff;
519 xfs_da_brelse(NULL, bp); 520 xfs_da_brelse(NULL, bp);
520 return 0; 521 return 0;
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 29f484c11b3a..e2d89854ec9e 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1081,7 +1081,7 @@ xfs_dir2_leaf_getdents(
1081 dep = (xfs_dir2_data_entry_t *)ptr; 1081 dep = (xfs_dir2_data_entry_t *)ptr;
1082 length = xfs_dir2_data_entsize(dep->namelen); 1082 length = xfs_dir2_data_entsize(dep->namelen);
1083 1083
1084 if (filldir(dirent, dep->name, dep->namelen, 1084 if (filldir(dirent, (char *)dep->name, dep->namelen,
1085 xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff, 1085 xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
1086 be64_to_cpu(dep->inumber), DT_UNKNOWN)) 1086 be64_to_cpu(dep->inumber), DT_UNKNOWN))
1087 break; 1087 break;
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index ce6e355199b5..78fc4d9ae756 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -65,7 +65,7 @@ static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
65/* 65/*
66 * Log entries from a freespace block. 66 * Log entries from a freespace block.
67 */ 67 */
68void 68STATIC void
69xfs_dir2_free_log_bests( 69xfs_dir2_free_log_bests(
70 xfs_trans_t *tp, /* transaction pointer */ 70 xfs_trans_t *tp, /* transaction pointer */
71 xfs_dabuf_t *bp, /* freespace buffer */ 71 xfs_dabuf_t *bp, /* freespace buffer */
diff --git a/fs/xfs/xfs_dir2_node.h b/fs/xfs/xfs_dir2_node.h
index dde72db3d695..82dfe7147195 100644
--- a/fs/xfs/xfs_dir2_node.h
+++ b/fs/xfs/xfs_dir2_node.h
@@ -75,8 +75,6 @@ xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db)
75 return ((db) % XFS_DIR2_MAX_FREE_BESTS(mp)); 75 return ((db) % XFS_DIR2_MAX_FREE_BESTS(mp));
76} 76}
77 77
78extern void xfs_dir2_free_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp,
79 int first, int last);
80extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, 78extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
81 struct xfs_dabuf *lbp); 79 struct xfs_dabuf *lbp);
82extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count); 80extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count);
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 9d4f17a69676..c1a5945d463a 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -782,7 +782,7 @@ xfs_dir2_sf_getdents(
782 } 782 }
783 783
784 ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep)); 784 ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep));
785 if (filldir(dirent, sfep->name, sfep->namelen, 785 if (filldir(dirent, (char *)sfep->name, sfep->namelen,
786 off & 0x7fffffff, ino, DT_UNKNOWN)) { 786 off & 0x7fffffff, ino, DT_UNKNOWN)) {
787 *offset = off & 0x7fffffff; 787 *offset = off & 0x7fffffff;
788 return 0; 788 return 0;
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 05a4bdd4be39..6f35ed1b39b9 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -82,7 +82,7 @@ xfs_efi_item_format(xfs_efi_log_item_t *efip,
82 82
83 log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format); 83 log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format);
84 log_vector->i_len = size; 84 log_vector->i_len = size;
85 XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFI_FORMAT); 85 log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT;
86 ASSERT(size >= sizeof(xfs_efi_log_format_t)); 86 ASSERT(size >= sizeof(xfs_efi_log_format_t));
87} 87}
88 88
@@ -406,7 +406,7 @@ xfs_efd_item_format(xfs_efd_log_item_t *efdp,
406 406
407 log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format); 407 log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format);
408 log_vector->i_len = size; 408 log_vector->i_len = size;
409 XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFD_FORMAT); 409 log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT;
410 ASSERT(size >= sizeof(xfs_efd_log_format_t)); 410 ASSERT(size >= sizeof(xfs_efd_log_format_t));
411} 411}
412 412
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index a631e1451abb..390850ee6603 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -140,6 +140,7 @@ _xfs_filestream_pick_ag(
140 int flags, 140 int flags,
141 xfs_extlen_t minlen) 141 xfs_extlen_t minlen)
142{ 142{
143 int streams, max_streams;
143 int err, trylock, nscan; 144 int err, trylock, nscan;
144 xfs_extlen_t longest, free, minfree, maxfree = 0; 145 xfs_extlen_t longest, free, minfree, maxfree = 0;
145 xfs_agnumber_t ag, max_ag = NULLAGNUMBER; 146 xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
@@ -155,15 +156,15 @@ _xfs_filestream_pick_ag(
155 trylock = XFS_ALLOC_FLAG_TRYLOCK; 156 trylock = XFS_ALLOC_FLAG_TRYLOCK;
156 157
157 for (nscan = 0; 1; nscan++) { 158 for (nscan = 0; 1; nscan++) {
158 159 pag = xfs_perag_get(mp, ag);
159 TRACE_AG_SCAN(mp, ag, xfs_filestream_peek_ag(mp, ag)); 160 TRACE_AG_SCAN(mp, ag, atomic_read(&pag->pagf_fstrms));
160
161 pag = mp->m_perag + ag;
162 161
163 if (!pag->pagf_init) { 162 if (!pag->pagf_init) {
164 err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); 163 err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
165 if (err && !trylock) 164 if (err && !trylock) {
165 xfs_perag_put(pag);
166 return err; 166 return err;
167 }
167 } 168 }
168 169
169 /* Might fail sometimes during the 1st pass with trylock set. */ 170 /* Might fail sometimes during the 1st pass with trylock set. */
@@ -173,6 +174,7 @@ _xfs_filestream_pick_ag(
173 /* Keep track of the AG with the most free blocks. */ 174 /* Keep track of the AG with the most free blocks. */
174 if (pag->pagf_freeblks > maxfree) { 175 if (pag->pagf_freeblks > maxfree) {
175 maxfree = pag->pagf_freeblks; 176 maxfree = pag->pagf_freeblks;
177 max_streams = atomic_read(&pag->pagf_fstrms);
176 max_ag = ag; 178 max_ag = ag;
177 } 179 }
178 180
@@ -195,6 +197,8 @@ _xfs_filestream_pick_ag(
195 197
196 /* Break out, retaining the reference on the AG. */ 198 /* Break out, retaining the reference on the AG. */
197 free = pag->pagf_freeblks; 199 free = pag->pagf_freeblks;
200 streams = atomic_read(&pag->pagf_fstrms);
201 xfs_perag_put(pag);
198 *agp = ag; 202 *agp = ag;
199 break; 203 break;
200 } 204 }
@@ -202,6 +206,7 @@ _xfs_filestream_pick_ag(
202 /* Drop the reference on this AG, it's not usable. */ 206 /* Drop the reference on this AG, it's not usable. */
203 xfs_filestream_put_ag(mp, ag); 207 xfs_filestream_put_ag(mp, ag);
204next_ag: 208next_ag:
209 xfs_perag_put(pag);
205 /* Move to the next AG, wrapping to AG 0 if necessary. */ 210 /* Move to the next AG, wrapping to AG 0 if necessary. */
206 if (++ag >= mp->m_sb.sb_agcount) 211 if (++ag >= mp->m_sb.sb_agcount)
207 ag = 0; 212 ag = 0;
@@ -229,6 +234,7 @@ next_ag:
229 if (max_ag != NULLAGNUMBER) { 234 if (max_ag != NULLAGNUMBER) {
230 xfs_filestream_get_ag(mp, max_ag); 235 xfs_filestream_get_ag(mp, max_ag);
231 TRACE_AG_PICK1(mp, max_ag, maxfree); 236 TRACE_AG_PICK1(mp, max_ag, maxfree);
237 streams = max_streams;
232 free = maxfree; 238 free = maxfree;
233 *agp = max_ag; 239 *agp = max_ag;
234 break; 240 break;
@@ -240,16 +246,14 @@ next_ag:
240 return 0; 246 return 0;
241 } 247 }
242 248
243 TRACE_AG_PICK2(mp, startag, *agp, xfs_filestream_peek_ag(mp, *agp), 249 TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags);
244 free, nscan, flags);
245 250
246 return 0; 251 return 0;
247} 252}
248 253
249/* 254/*
250 * Set the allocation group number for a file or a directory, updating inode 255 * Set the allocation group number for a file or a directory, updating inode
251 * references and per-AG references as appropriate. Must be called with the 256 * references and per-AG references as appropriate.
252 * m_peraglock held in read mode.
253 */ 257 */
254static int 258static int
255_xfs_filestream_update_ag( 259_xfs_filestream_update_ag(
@@ -451,20 +455,6 @@ xfs_filestream_unmount(
451} 455}
452 456
453/* 457/*
454 * If the mount point's m_perag array is going to be reallocated, all
455 * outstanding cache entries must be flushed to avoid accessing reference count
456 * addresses that have been freed. The call to xfs_filestream_flush() must be
457 * made inside the block that holds the m_peraglock in write mode to do the
458 * reallocation.
459 */
460void
461xfs_filestream_flush(
462 xfs_mount_t *mp)
463{
464 xfs_mru_cache_flush(mp->m_filestream);
465}
466
467/*
468 * Return the AG of the filestream the file or directory belongs to, or 458 * Return the AG of the filestream the file or directory belongs to, or
469 * NULLAGNUMBER otherwise. 459 * NULLAGNUMBER otherwise.
470 */ 460 */
@@ -526,7 +516,6 @@ xfs_filestream_associate(
526 516
527 mp = pip->i_mount; 517 mp = pip->i_mount;
528 cache = mp->m_filestream; 518 cache = mp->m_filestream;
529 down_read(&mp->m_peraglock);
530 519
531 /* 520 /*
532 * We have a problem, Houston. 521 * We have a problem, Houston.
@@ -543,10 +532,8 @@ xfs_filestream_associate(
543 * 532 *
544 * So, if we can't get the iolock without sleeping then just give up 533 * So, if we can't get the iolock without sleeping then just give up
545 */ 534 */
546 if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) { 535 if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL))
547 up_read(&mp->m_peraglock);
548 return 1; 536 return 1;
549 }
550 537
551 /* If the parent directory is already in the cache, use its AG. */ 538 /* If the parent directory is already in the cache, use its AG. */
552 item = xfs_mru_cache_lookup(cache, pip->i_ino); 539 item = xfs_mru_cache_lookup(cache, pip->i_ino);
@@ -601,7 +588,6 @@ exit_did_pick:
601 588
602exit: 589exit:
603 xfs_iunlock(pip, XFS_IOLOCK_EXCL); 590 xfs_iunlock(pip, XFS_IOLOCK_EXCL);
604 up_read(&mp->m_peraglock);
605 return -err; 591 return -err;
606} 592}
607 593
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index 4aba67c5f64f..260f757bbc5d 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -79,12 +79,21 @@ extern ktrace_t *xfs_filestreams_trace_buf;
79 * the cache that reference per-ag array elements that have since been 79 * the cache that reference per-ag array elements that have since been
80 * reallocated. 80 * reallocated.
81 */ 81 */
82/*
83 * xfs_filestream_peek_ag is only used in tracing code
84 */
82static inline int 85static inline int
83xfs_filestream_peek_ag( 86xfs_filestream_peek_ag(
84 xfs_mount_t *mp, 87 xfs_mount_t *mp,
85 xfs_agnumber_t agno) 88 xfs_agnumber_t agno)
86{ 89{
87 return atomic_read(&mp->m_perag[agno].pagf_fstrms); 90 struct xfs_perag *pag;
91 int ret;
92
93 pag = xfs_perag_get(mp, agno);
94 ret = atomic_read(&pag->pagf_fstrms);
95 xfs_perag_put(pag);
96 return ret;
88} 97}
89 98
90static inline int 99static inline int
@@ -92,7 +101,13 @@ xfs_filestream_get_ag(
92 xfs_mount_t *mp, 101 xfs_mount_t *mp,
93 xfs_agnumber_t agno) 102 xfs_agnumber_t agno)
94{ 103{
95 return atomic_inc_return(&mp->m_perag[agno].pagf_fstrms); 104 struct xfs_perag *pag;
105 int ret;
106
107 pag = xfs_perag_get(mp, agno);
108 ret = atomic_inc_return(&pag->pagf_fstrms);
109 xfs_perag_put(pag);
110 return ret;
96} 111}
97 112
98static inline int 113static inline int
@@ -100,7 +115,13 @@ xfs_filestream_put_ag(
100 xfs_mount_t *mp, 115 xfs_mount_t *mp,
101 xfs_agnumber_t agno) 116 xfs_agnumber_t agno)
102{ 117{
103 return atomic_dec_return(&mp->m_perag[agno].pagf_fstrms); 118 struct xfs_perag *pag;
119 int ret;
120
121 pag = xfs_perag_get(mp, agno);
122 ret = atomic_dec_return(&pag->pagf_fstrms);
123 xfs_perag_put(pag);
124 return ret;
104} 125}
105 126
106/* allocation selection flags */ 127/* allocation selection flags */
@@ -114,7 +135,6 @@ int xfs_filestream_init(void);
114void xfs_filestream_uninit(void); 135void xfs_filestream_uninit(void);
115int xfs_filestream_mount(struct xfs_mount *mp); 136int xfs_filestream_mount(struct xfs_mount *mp);
116void xfs_filestream_unmount(struct xfs_mount *mp); 137void xfs_filestream_unmount(struct xfs_mount *mp);
117void xfs_filestream_flush(struct xfs_mount *mp);
118xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); 138xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip);
119int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip); 139int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip);
120void xfs_filestream_deassociate(struct xfs_inode *ip); 140void xfs_filestream_deassociate(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index f52ac276277e..7cf7220e7d5f 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -292,7 +292,8 @@ typedef struct xfs_bstat {
292 __s32 bs_extents; /* number of extents */ 292 __s32 bs_extents; /* number of extents */
293 __u32 bs_gen; /* generation count */ 293 __u32 bs_gen; /* generation count */
294 __u16 bs_projid; /* project id */ 294 __u16 bs_projid; /* project id */
295 unsigned char bs_pad[14]; /* pad space, unused */ 295 __u16 bs_forkoff; /* inode fork offset in bytes */
296 unsigned char bs_pad[12]; /* pad space, unused */
296 __u32 bs_dmevmask; /* DMIG event mask */ 297 __u32 bs_dmevmask; /* DMIG event mask */
297 __u16 bs_dmstate; /* DMIG state info */ 298 __u16 bs_dmstate; /* DMIG state info */
298 __u16 bs_aextents; /* attribute number of extents */ 299 __u16 bs_aextents; /* attribute number of extents */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index a13919a6a364..37a6f62c57b6 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -167,27 +167,14 @@ xfs_growfs_data_private(
167 } 167 }
168 new = nb - mp->m_sb.sb_dblocks; 168 new = nb - mp->m_sb.sb_dblocks;
169 oagcount = mp->m_sb.sb_agcount; 169 oagcount = mp->m_sb.sb_agcount;
170 if (nagcount > oagcount) {
171 void *new_perag, *old_perag;
172
173 xfs_filestream_flush(mp);
174
175 new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount,
176 KM_MAYFAIL);
177 if (!new_perag)
178 return XFS_ERROR(ENOMEM);
179
180 down_write(&mp->m_peraglock);
181 memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount);
182 old_perag = mp->m_perag;
183 mp->m_perag = new_perag;
184
185 mp->m_flags |= XFS_MOUNT_32BITINODES;
186 nagimax = xfs_initialize_perag(mp, nagcount);
187 up_write(&mp->m_peraglock);
188 170
189 kmem_free(old_perag); 171 /* allocate the new per-ag structures */
172 if (nagcount > oagcount) {
173 error = xfs_initialize_perag(mp, nagcount, &nagimax);
174 if (error)
175 return error;
190 } 176 }
177
191 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); 178 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
192 tp->t_flags |= XFS_TRANS_RESERVE; 179 tp->t_flags |= XFS_TRANS_RESERVE;
193 if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp), 180 if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp),
@@ -196,6 +183,11 @@ xfs_growfs_data_private(
196 return error; 183 return error;
197 } 184 }
198 185
186 /*
187 * Write new AG headers to disk. Non-transactional, but written
188 * synchronously so they are completed prior to the growfs transaction
189 * being logged.
190 */
199 nfree = 0; 191 nfree = 0;
200 for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { 192 for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
201 /* 193 /*
@@ -359,6 +351,12 @@ xfs_growfs_data_private(
359 goto error0; 351 goto error0;
360 } 352 }
361 } 353 }
354
355 /*
356 * Update changed superblock fields transactionally. These are not
357 * seen by the rest of the world until the transaction commit applies
358 * them atomically to the superblock.
359 */
362 if (nagcount > oagcount) 360 if (nagcount > oagcount)
363 xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount); 361 xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
364 if (nb > mp->m_sb.sb_dblocks) 362 if (nb > mp->m_sb.sb_dblocks)
@@ -369,9 +367,9 @@ xfs_growfs_data_private(
369 if (dpct) 367 if (dpct)
370 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); 368 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
371 error = xfs_trans_commit(tp, 0); 369 error = xfs_trans_commit(tp, 0);
372 if (error) { 370 if (error)
373 return error; 371 return error;
374 } 372
375 /* New allocation groups fully initialized, so update mount struct */ 373 /* New allocation groups fully initialized, so update mount struct */
376 if (nagimax) 374 if (nagimax)
377 mp->m_maxagi = nagimax; 375 mp->m_maxagi = nagimax;
@@ -381,6 +379,8 @@ xfs_growfs_data_private(
381 mp->m_maxicount = icount << mp->m_sb.sb_inopblog; 379 mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
382 } else 380 } else
383 mp->m_maxicount = 0; 381 mp->m_maxicount = 0;
382
383 /* update secondary superblocks. */
384 for (agno = 1; agno < nagcount; agno++) { 384 for (agno = 1; agno < nagcount; agno++) {
385 error = xfs_read_buf(mp, mp->m_ddev_targp, 385 error = xfs_read_buf(mp, mp->m_ddev_targp,
386 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), 386 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index cb907ba69c4c..9d884c127bb9 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -205,7 +205,7 @@ xfs_ialloc_inode_init(
205 d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); 205 d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
206 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, 206 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
207 mp->m_bsize * blks_per_cluster, 207 mp->m_bsize * blks_per_cluster,
208 XFS_BUF_LOCK); 208 XBF_LOCK);
209 ASSERT(fbuf); 209 ASSERT(fbuf);
210 ASSERT(!XFS_BUF_GETERROR(fbuf)); 210 ASSERT(!XFS_BUF_GETERROR(fbuf));
211 211
@@ -253,6 +253,7 @@ xfs_ialloc_ag_alloc(
253 xfs_agino_t thisino; /* current inode number, for loop */ 253 xfs_agino_t thisino; /* current inode number, for loop */
254 int isaligned = 0; /* inode allocation at stripe unit */ 254 int isaligned = 0; /* inode allocation at stripe unit */
255 /* boundary */ 255 /* boundary */
256 struct xfs_perag *pag;
256 257
257 args.tp = tp; 258 args.tp = tp;
258 args.mp = tp->t_mountp; 259 args.mp = tp->t_mountp;
@@ -382,9 +383,9 @@ xfs_ialloc_ag_alloc(
382 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); 383 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
383 be32_add_cpu(&agi->agi_count, newlen); 384 be32_add_cpu(&agi->agi_count, newlen);
384 be32_add_cpu(&agi->agi_freecount, newlen); 385 be32_add_cpu(&agi->agi_freecount, newlen);
385 down_read(&args.mp->m_peraglock); 386 pag = xfs_perag_get(args.mp, agno);
386 args.mp->m_perag[agno].pagi_freecount += newlen; 387 pag->pagi_freecount += newlen;
387 up_read(&args.mp->m_peraglock); 388 xfs_perag_put(pag);
388 agi->agi_newino = cpu_to_be32(newino); 389 agi->agi_newino = cpu_to_be32(newino);
389 390
390 /* 391 /*
@@ -486,9 +487,8 @@ xfs_ialloc_ag_select(
486 */ 487 */
487 agno = pagno; 488 agno = pagno;
488 flags = XFS_ALLOC_FLAG_TRYLOCK; 489 flags = XFS_ALLOC_FLAG_TRYLOCK;
489 down_read(&mp->m_peraglock);
490 for (;;) { 490 for (;;) {
491 pag = &mp->m_perag[agno]; 491 pag = xfs_perag_get(mp, agno);
492 if (!pag->pagi_init) { 492 if (!pag->pagi_init) {
493 if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { 493 if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
494 agbp = NULL; 494 agbp = NULL;
@@ -527,7 +527,7 @@ xfs_ialloc_ag_select(
527 agbp = NULL; 527 agbp = NULL;
528 goto nextag; 528 goto nextag;
529 } 529 }
530 up_read(&mp->m_peraglock); 530 xfs_perag_put(pag);
531 return agbp; 531 return agbp;
532 } 532 }
533 } 533 }
@@ -535,22 +535,19 @@ unlock_nextag:
535 if (agbp) 535 if (agbp)
536 xfs_trans_brelse(tp, agbp); 536 xfs_trans_brelse(tp, agbp);
537nextag: 537nextag:
538 xfs_perag_put(pag);
538 /* 539 /*
539 * No point in iterating over the rest, if we're shutting 540 * No point in iterating over the rest, if we're shutting
540 * down. 541 * down.
541 */ 542 */
542 if (XFS_FORCED_SHUTDOWN(mp)) { 543 if (XFS_FORCED_SHUTDOWN(mp))
543 up_read(&mp->m_peraglock);
544 return NULL; 544 return NULL;
545 }
546 agno++; 545 agno++;
547 if (agno >= agcount) 546 if (agno >= agcount)
548 agno = 0; 547 agno = 0;
549 if (agno == pagno) { 548 if (agno == pagno) {
550 if (flags == 0) { 549 if (flags == 0)
551 up_read(&mp->m_peraglock);
552 return NULL; 550 return NULL;
553 }
554 flags = 0; 551 flags = 0;
555 } 552 }
556 } 553 }
@@ -672,6 +669,7 @@ xfs_dialloc(
672 xfs_agnumber_t tagno; /* testing allocation group number */ 669 xfs_agnumber_t tagno; /* testing allocation group number */
673 xfs_btree_cur_t *tcur; /* temp cursor */ 670 xfs_btree_cur_t *tcur; /* temp cursor */
674 xfs_inobt_rec_incore_t trec; /* temp inode allocation record */ 671 xfs_inobt_rec_incore_t trec; /* temp inode allocation record */
672 struct xfs_perag *pag;
675 673
676 674
677 if (*IO_agbp == NULL) { 675 if (*IO_agbp == NULL) {
@@ -771,13 +769,13 @@ nextag:
771 *inop = NULLFSINO; 769 *inop = NULLFSINO;
772 return noroom ? ENOSPC : 0; 770 return noroom ? ENOSPC : 0;
773 } 771 }
774 down_read(&mp->m_peraglock); 772 pag = xfs_perag_get(mp, tagno);
775 if (mp->m_perag[tagno].pagi_inodeok == 0) { 773 if (pag->pagi_inodeok == 0) {
776 up_read(&mp->m_peraglock); 774 xfs_perag_put(pag);
777 goto nextag; 775 goto nextag;
778 } 776 }
779 error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp); 777 error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp);
780 up_read(&mp->m_peraglock); 778 xfs_perag_put(pag);
781 if (error) 779 if (error)
782 goto nextag; 780 goto nextag;
783 agi = XFS_BUF_TO_AGI(agbp); 781 agi = XFS_BUF_TO_AGI(agbp);
@@ -790,6 +788,7 @@ nextag:
790 */ 788 */
791 agno = tagno; 789 agno = tagno;
792 *IO_agbp = NULL; 790 *IO_agbp = NULL;
791 pag = xfs_perag_get(mp, agno);
793 792
794 restart_pagno: 793 restart_pagno:
795 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno)); 794 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
@@ -808,7 +807,6 @@ nextag:
808 * If in the same AG as the parent, try to get near the parent. 807 * If in the same AG as the parent, try to get near the parent.
809 */ 808 */
810 if (pagno == agno) { 809 if (pagno == agno) {
811 xfs_perag_t *pag = &mp->m_perag[agno];
812 int doneleft; /* done, to the left */ 810 int doneleft; /* done, to the left */
813 int doneright; /* done, to the right */ 811 int doneright; /* done, to the right */
814 int searchdistance = 10; 812 int searchdistance = 10;
@@ -1006,9 +1004,7 @@ alloc_inode:
1006 goto error0; 1004 goto error0;
1007 be32_add_cpu(&agi->agi_freecount, -1); 1005 be32_add_cpu(&agi->agi_freecount, -1);
1008 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); 1006 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1009 down_read(&mp->m_peraglock); 1007 pag->pagi_freecount--;
1010 mp->m_perag[tagno].pagi_freecount--;
1011 up_read(&mp->m_peraglock);
1012 1008
1013 error = xfs_check_agi_freecount(cur, agi); 1009 error = xfs_check_agi_freecount(cur, agi);
1014 if (error) 1010 if (error)
@@ -1016,12 +1012,14 @@ alloc_inode:
1016 1012
1017 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1013 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1018 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); 1014 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
1015 xfs_perag_put(pag);
1019 *inop = ino; 1016 *inop = ino;
1020 return 0; 1017 return 0;
1021error1: 1018error1:
1022 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 1019 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1023error0: 1020error0:
1024 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 1021 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1022 xfs_perag_put(pag);
1025 return error; 1023 return error;
1026} 1024}
1027 1025
@@ -1052,6 +1050,7 @@ xfs_difree(
1052 xfs_mount_t *mp; /* mount structure for filesystem */ 1050 xfs_mount_t *mp; /* mount structure for filesystem */
1053 int off; /* offset of inode in inode chunk */ 1051 int off; /* offset of inode in inode chunk */
1054 xfs_inobt_rec_incore_t rec; /* btree record */ 1052 xfs_inobt_rec_incore_t rec; /* btree record */
1053 struct xfs_perag *pag;
1055 1054
1056 mp = tp->t_mountp; 1055 mp = tp->t_mountp;
1057 1056
@@ -1088,9 +1087,7 @@ xfs_difree(
1088 /* 1087 /*
1089 * Get the allocation group header. 1088 * Get the allocation group header.
1090 */ 1089 */
1091 down_read(&mp->m_peraglock);
1092 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1090 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1093 up_read(&mp->m_peraglock);
1094 if (error) { 1091 if (error) {
1095 cmn_err(CE_WARN, 1092 cmn_err(CE_WARN,
1096 "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.", 1093 "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.",
@@ -1157,9 +1154,9 @@ xfs_difree(
1157 be32_add_cpu(&agi->agi_count, -ilen); 1154 be32_add_cpu(&agi->agi_count, -ilen);
1158 be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); 1155 be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
1159 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); 1156 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
1160 down_read(&mp->m_peraglock); 1157 pag = xfs_perag_get(mp, agno);
1161 mp->m_perag[agno].pagi_freecount -= ilen - 1; 1158 pag->pagi_freecount -= ilen - 1;
1162 up_read(&mp->m_peraglock); 1159 xfs_perag_put(pag);
1163 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); 1160 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
1164 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); 1161 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1165 1162
@@ -1188,9 +1185,9 @@ xfs_difree(
1188 */ 1185 */
1189 be32_add_cpu(&agi->agi_freecount, 1); 1186 be32_add_cpu(&agi->agi_freecount, 1);
1190 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); 1187 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1191 down_read(&mp->m_peraglock); 1188 pag = xfs_perag_get(mp, agno);
1192 mp->m_perag[agno].pagi_freecount++; 1189 pag->pagi_freecount++;
1193 up_read(&mp->m_peraglock); 1190 xfs_perag_put(pag);
1194 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); 1191 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
1195 } 1192 }
1196 1193
@@ -1312,9 +1309,7 @@ xfs_imap(
1312 xfs_buf_t *agbp; /* agi buffer */ 1309 xfs_buf_t *agbp; /* agi buffer */
1313 int i; /* temp state */ 1310 int i; /* temp state */
1314 1311
1315 down_read(&mp->m_peraglock);
1316 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1312 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1317 up_read(&mp->m_peraglock);
1318 if (error) { 1313 if (error) {
1319 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1314 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1320 "xfs_ialloc_read_agi() returned " 1315 "xfs_ialloc_read_agi() returned "
@@ -1379,7 +1374,6 @@ xfs_imap(
1379 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); 1374 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
1380 return XFS_ERROR(EINVAL); 1375 return XFS_ERROR(EINVAL);
1381 } 1376 }
1382
1383 return 0; 1377 return 0;
1384} 1378}
1385 1379
@@ -1523,8 +1517,7 @@ xfs_ialloc_read_agi(
1523 return error; 1517 return error;
1524 1518
1525 agi = XFS_BUF_TO_AGI(*bpp); 1519 agi = XFS_BUF_TO_AGI(*bpp);
1526 pag = &mp->m_perag[agno]; 1520 pag = xfs_perag_get(mp, agno);
1527
1528 if (!pag->pagi_init) { 1521 if (!pag->pagi_init) {
1529 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); 1522 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
1530 pag->pagi_count = be32_to_cpu(agi->agi_count); 1523 pag->pagi_count = be32_to_cpu(agi->agi_count);
@@ -1537,6 +1530,7 @@ xfs_ialloc_read_agi(
1537 */ 1530 */
1538 ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || 1531 ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
1539 XFS_FORCED_SHUTDOWN(mp)); 1532 XFS_FORCED_SHUTDOWN(mp));
1533 xfs_perag_put(pag);
1540 return 0; 1534 return 0;
1541} 1535}
1542 1536
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index fa402a6bbbcf..6845db90818f 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -73,7 +73,6 @@ xfs_inode_alloc(
73 ASSERT(atomic_read(&ip->i_pincount) == 0); 73 ASSERT(atomic_read(&ip->i_pincount) == 0);
74 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 74 ASSERT(!spin_is_locked(&ip->i_flags_lock));
75 ASSERT(completion_done(&ip->i_flush)); 75 ASSERT(completion_done(&ip->i_flush));
76 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
77 76
78 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 77 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
79 78
@@ -191,13 +190,12 @@ xfs_iget_cache_hit(
191 trace_xfs_iget_reclaim(ip); 190 trace_xfs_iget_reclaim(ip);
192 191
193 /* 192 /*
194 * We need to set XFS_INEW atomically with clearing the 193 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
195 * reclaimable tag so that we do have an indicator of the 194 * from stomping over us while we recycle the inode. We can't
196 * inode still being initialized. 195 * clear the radix tree reclaimable tag yet as it requires
196 * pag_ici_lock to be held exclusive.
197 */ 197 */
198 ip->i_flags |= XFS_INEW; 198 ip->i_flags |= XFS_IRECLAIM;
199 ip->i_flags &= ~XFS_IRECLAIMABLE;
200 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
201 199
202 spin_unlock(&ip->i_flags_lock); 200 spin_unlock(&ip->i_flags_lock);
203 read_unlock(&pag->pag_ici_lock); 201 read_unlock(&pag->pag_ici_lock);
@@ -217,7 +215,15 @@ xfs_iget_cache_hit(
217 trace_xfs_iget_reclaim(ip); 215 trace_xfs_iget_reclaim(ip);
218 goto out_error; 216 goto out_error;
219 } 217 }
218
219 write_lock(&pag->pag_ici_lock);
220 spin_lock(&ip->i_flags_lock);
221 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
222 ip->i_flags |= XFS_INEW;
223 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
220 inode->i_state = I_NEW; 224 inode->i_state = I_NEW;
225 spin_unlock(&ip->i_flags_lock);
226 write_unlock(&pag->pag_ici_lock);
221 } else { 227 } else {
222 /* If the VFS inode is being torn down, pause and try again. */ 228 /* If the VFS inode is being torn down, pause and try again. */
223 if (!igrab(inode)) { 229 if (!igrab(inode)) {
@@ -375,7 +381,7 @@ xfs_iget(
375 return EINVAL; 381 return EINVAL;
376 382
377 /* get the perag structure and ensure that it's inode capable */ 383 /* get the perag structure and ensure that it's inode capable */
378 pag = xfs_get_perag(mp, ino); 384 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
379 if (!pag->pagi_inodeok) 385 if (!pag->pagi_inodeok)
380 return EINVAL; 386 return EINVAL;
381 ASSERT(pag->pag_ici_init); 387 ASSERT(pag->pag_ici_init);
@@ -399,7 +405,7 @@ again:
399 if (error) 405 if (error)
400 goto out_error_or_again; 406 goto out_error_or_again;
401 } 407 }
402 xfs_put_perag(mp, pag); 408 xfs_perag_put(pag);
403 409
404 *ipp = ip; 410 *ipp = ip;
405 411
@@ -418,7 +424,7 @@ out_error_or_again:
418 delay(1); 424 delay(1);
419 goto again; 425 goto again;
420 } 426 }
421 xfs_put_perag(mp, pag); 427 xfs_perag_put(pag);
422 return error; 428 return error;
423} 429}
424 430
@@ -489,12 +495,12 @@ xfs_ireclaim(
489 * added to the tree assert that it's been there before to catch 495 * added to the tree assert that it's been there before to catch
490 * problems with the inode life time early on. 496 * problems with the inode life time early on.
491 */ 497 */
492 pag = xfs_get_perag(mp, ip->i_ino); 498 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
493 write_lock(&pag->pag_ici_lock); 499 write_lock(&pag->pag_ici_lock);
494 if (!radix_tree_delete(&pag->pag_ici_root, agino)) 500 if (!radix_tree_delete(&pag->pag_ici_root, agino))
495 ASSERT(0); 501 ASSERT(0);
496 write_unlock(&pag->pag_ici_lock); 502 write_unlock(&pag->pag_ici_lock);
497 xfs_put_perag(mp, pag); 503 xfs_perag_put(pag);
498 504
499 /* 505 /*
500 * Here we do an (almost) spurious inode lock in order to coordinate 506 * Here we do an (almost) spurious inode lock in order to coordinate
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ce278b3ae7fc..0ffd56447045 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -151,7 +151,7 @@ xfs_imap_to_bp(
151 "an error %d on %s. Returning error.", 151 "an error %d on %s. Returning error.",
152 error, mp->m_fsname); 152 error, mp->m_fsname);
153 } else { 153 } else {
154 ASSERT(buf_flags & XFS_BUF_TRYLOCK); 154 ASSERT(buf_flags & XBF_TRYLOCK);
155 } 155 }
156 return error; 156 return error;
157 } 157 }
@@ -239,7 +239,7 @@ xfs_inotobp(
239 if (error) 239 if (error)
240 return error; 240 return error;
241 241
242 error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags); 242 error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags);
243 if (error) 243 if (error)
244 return error; 244 return error;
245 245
@@ -285,7 +285,7 @@ xfs_itobp(
285 return error; 285 return error;
286 286
287 if (!bp) { 287 if (!bp) {
288 ASSERT(buf_flags & XFS_BUF_TRYLOCK); 288 ASSERT(buf_flags & XBF_TRYLOCK);
289 ASSERT(tp == NULL); 289 ASSERT(tp == NULL);
290 *bpp = NULL; 290 *bpp = NULL;
291 return EAGAIN; 291 return EAGAIN;
@@ -807,7 +807,7 @@ xfs_iread(
807 * Get pointers to the on-disk inode and the buffer containing it. 807 * Get pointers to the on-disk inode and the buffer containing it.
808 */ 808 */
809 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, 809 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp,
810 XFS_BUF_LOCK, iget_flags); 810 XBF_LOCK, iget_flags);
811 if (error) 811 if (error)
812 return error; 812 return error;
813 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); 813 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
@@ -1751,7 +1751,7 @@ xfs_iunlink(
1751 * Here we put the head pointer into our next pointer, 1751 * Here we put the head pointer into our next pointer,
1752 * and then we fall through to point the head at us. 1752 * and then we fall through to point the head at us.
1753 */ 1753 */
1754 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); 1754 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1755 if (error) 1755 if (error)
1756 return error; 1756 return error;
1757 1757
@@ -1833,7 +1833,7 @@ xfs_iunlink_remove(
1833 * of dealing with the buffer when there is no need to 1833 * of dealing with the buffer when there is no need to
1834 * change it. 1834 * change it.
1835 */ 1835 */
1836 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); 1836 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1837 if (error) { 1837 if (error) {
1838 cmn_err(CE_WARN, 1838 cmn_err(CE_WARN,
1839 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1839 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -1895,7 +1895,7 @@ xfs_iunlink_remove(
1895 * Now last_ibp points to the buffer previous to us on 1895 * Now last_ibp points to the buffer previous to us on
1896 * the unlinked list. Pull us from the list. 1896 * the unlinked list. Pull us from the list.
1897 */ 1897 */
1898 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); 1898 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1899 if (error) { 1899 if (error) {
1900 cmn_err(CE_WARN, 1900 cmn_err(CE_WARN,
1901 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1901 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -1946,8 +1946,9 @@ xfs_ifree_cluster(
1946 xfs_inode_t *ip, **ip_found; 1946 xfs_inode_t *ip, **ip_found;
1947 xfs_inode_log_item_t *iip; 1947 xfs_inode_log_item_t *iip;
1948 xfs_log_item_t *lip; 1948 xfs_log_item_t *lip;
1949 xfs_perag_t *pag = xfs_get_perag(mp, inum); 1949 struct xfs_perag *pag;
1950 1950
1951 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
1951 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { 1952 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
1952 blks_per_cluster = 1; 1953 blks_per_cluster = 1;
1953 ninodes = mp->m_sb.sb_inopblock; 1954 ninodes = mp->m_sb.sb_inopblock;
@@ -2039,7 +2040,7 @@ xfs_ifree_cluster(
2039 2040
2040 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 2041 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2041 mp->m_bsize * blks_per_cluster, 2042 mp->m_bsize * blks_per_cluster,
2042 XFS_BUF_LOCK); 2043 XBF_LOCK);
2043 2044
2044 pre_flushed = 0; 2045 pre_flushed = 0;
2045 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 2046 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
@@ -2088,7 +2089,7 @@ xfs_ifree_cluster(
2088 } 2089 }
2089 2090
2090 kmem_free(ip_found); 2091 kmem_free(ip_found);
2091 xfs_put_perag(mp, pag); 2092 xfs_perag_put(pag);
2092} 2093}
2093 2094
2094/* 2095/*
@@ -2150,7 +2151,7 @@ xfs_ifree(
2150 2151
2151 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2152 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2152 2153
2153 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XFS_BUF_LOCK); 2154 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK);
2154 if (error) 2155 if (error)
2155 return error; 2156 return error;
2156 2157
@@ -2438,72 +2439,31 @@ xfs_idestroy_fork(
2438} 2439}
2439 2440
2440/* 2441/*
2441 * Increment the pin count of the given buffer. 2442 * This is called to unpin an inode. The caller must have the inode locked
2442 * This value is protected by ipinlock spinlock in the mount structure. 2443 * in at least shared mode so that the buffer cannot be subsequently pinned
2444 * once someone is waiting for it to be unpinned.
2443 */ 2445 */
2444void 2446static void
2445xfs_ipin( 2447xfs_iunpin_nowait(
2446 xfs_inode_t *ip) 2448 struct xfs_inode *ip)
2447{
2448 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2449
2450 atomic_inc(&ip->i_pincount);
2451}
2452
2453/*
2454 * Decrement the pin count of the given inode, and wake up
2455 * anyone in xfs_iwait_unpin() if the count goes to 0. The
2456 * inode must have been previously pinned with a call to xfs_ipin().
2457 */
2458void
2459xfs_iunpin(
2460 xfs_inode_t *ip)
2461{
2462 ASSERT(atomic_read(&ip->i_pincount) > 0);
2463
2464 if (atomic_dec_and_test(&ip->i_pincount))
2465 wake_up(&ip->i_ipin_wait);
2466}
2467
2468/*
2469 * This is called to unpin an inode. It can be directed to wait or to return
2470 * immediately without waiting for the inode to be unpinned. The caller must
2471 * have the inode locked in at least shared mode so that the buffer cannot be
2472 * subsequently pinned once someone is waiting for it to be unpinned.
2473 */
2474STATIC void
2475__xfs_iunpin_wait(
2476 xfs_inode_t *ip,
2477 int wait)
2478{ 2449{
2479 xfs_inode_log_item_t *iip = ip->i_itemp;
2480
2481 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2482 if (atomic_read(&ip->i_pincount) == 0)
2483 return;
2484 2451
2485 /* Give the log a push to start the unpinning I/O */ 2452 /* Give the log a push to start the unpinning I/O */
2486 xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ? 2453 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
2487 iip->ili_last_lsn : 0, XFS_LOG_FORCE);
2488 if (wait)
2489 wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
2490}
2491 2454
2492static inline void
2493xfs_iunpin_wait(
2494 xfs_inode_t *ip)
2495{
2496 __xfs_iunpin_wait(ip, 1);
2497} 2455}
2498 2456
2499static inline void 2457void
2500xfs_iunpin_nowait( 2458xfs_iunpin_wait(
2501 xfs_inode_t *ip) 2459 struct xfs_inode *ip)
2502{ 2460{
2503 __xfs_iunpin_wait(ip, 0); 2461 if (xfs_ipincount(ip)) {
2462 xfs_iunpin_nowait(ip);
2463 wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0));
2464 }
2504} 2465}
2505 2466
2506
2507/* 2467/*
2508 * xfs_iextents_copy() 2468 * xfs_iextents_copy()
2509 * 2469 *
@@ -2675,7 +2635,7 @@ xfs_iflush_cluster(
2675 xfs_buf_t *bp) 2635 xfs_buf_t *bp)
2676{ 2636{
2677 xfs_mount_t *mp = ip->i_mount; 2637 xfs_mount_t *mp = ip->i_mount;
2678 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); 2638 struct xfs_perag *pag;
2679 unsigned long first_index, mask; 2639 unsigned long first_index, mask;
2680 unsigned long inodes_per_cluster; 2640 unsigned long inodes_per_cluster;
2681 int ilist_size; 2641 int ilist_size;
@@ -2686,6 +2646,7 @@ xfs_iflush_cluster(
2686 int bufwasdelwri; 2646 int bufwasdelwri;
2687 int i; 2647 int i;
2688 2648
2649 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2689 ASSERT(pag->pagi_inodeok); 2650 ASSERT(pag->pagi_inodeok);
2690 ASSERT(pag->pag_ici_init); 2651 ASSERT(pag->pag_ici_init);
2691 2652
@@ -2693,7 +2654,7 @@ xfs_iflush_cluster(
2693 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); 2654 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
2694 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); 2655 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
2695 if (!ilist) 2656 if (!ilist)
2696 return 0; 2657 goto out_put;
2697 2658
2698 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2659 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2699 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2660 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
@@ -2762,6 +2723,8 @@ xfs_iflush_cluster(
2762out_free: 2723out_free:
2763 read_unlock(&pag->pag_ici_lock); 2724 read_unlock(&pag->pag_ici_lock);
2764 kmem_free(ilist); 2725 kmem_free(ilist);
2726out_put:
2727 xfs_perag_put(pag);
2765 return 0; 2728 return 0;
2766 2729
2767 2730
@@ -2805,6 +2768,7 @@ cluster_corrupt_out:
2805 */ 2768 */
2806 xfs_iflush_abort(iq); 2769 xfs_iflush_abort(iq);
2807 kmem_free(ilist); 2770 kmem_free(ilist);
2771 xfs_perag_put(pag);
2808 return XFS_ERROR(EFSCORRUPTED); 2772 return XFS_ERROR(EFSCORRUPTED);
2809} 2773}
2810 2774
@@ -2827,8 +2791,6 @@ xfs_iflush(
2827 xfs_dinode_t *dip; 2791 xfs_dinode_t *dip;
2828 xfs_mount_t *mp; 2792 xfs_mount_t *mp;
2829 int error; 2793 int error;
2830 int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
2831 enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
2832 2794
2833 XFS_STATS_INC(xs_iflush_count); 2795 XFS_STATS_INC(xs_iflush_count);
2834 2796
@@ -2841,15 +2803,6 @@ xfs_iflush(
2841 mp = ip->i_mount; 2803 mp = ip->i_mount;
2842 2804
2843 /* 2805 /*
2844 * If the inode isn't dirty, then just release the inode
2845 * flush lock and do nothing.
2846 */
2847 if (xfs_inode_clean(ip)) {
2848 xfs_ifunlock(ip);
2849 return 0;
2850 }
2851
2852 /*
2853 * We can't flush the inode until it is unpinned, so wait for it if we 2806 * We can't flush the inode until it is unpinned, so wait for it if we
2854 * are allowed to block. We know noone new can pin it, because we are 2807 * are allowed to block. We know noone new can pin it, because we are
2855 * holding the inode lock shared and you need to hold it exclusively to 2808 * holding the inode lock shared and you need to hold it exclusively to
@@ -2860,7 +2813,7 @@ xfs_iflush(
2860 * in the same cluster are dirty, they will probably write the inode 2813 * in the same cluster are dirty, they will probably write the inode
2861 * out for us if they occur after the log force completes. 2814 * out for us if they occur after the log force completes.
2862 */ 2815 */
2863 if (noblock && xfs_ipincount(ip)) { 2816 if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
2864 xfs_iunpin_nowait(ip); 2817 xfs_iunpin_nowait(ip);
2865 xfs_ifunlock(ip); 2818 xfs_ifunlock(ip);
2866 return EAGAIN; 2819 return EAGAIN;
@@ -2868,6 +2821,19 @@ xfs_iflush(
2868 xfs_iunpin_wait(ip); 2821 xfs_iunpin_wait(ip);
2869 2822
2870 /* 2823 /*
2824 * For stale inodes we cannot rely on the backing buffer remaining
2825 * stale in cache for the remaining life of the stale inode and so
2826 * xfs_itobp() below may give us a buffer that no longer contains
2827 * inodes below. We have to check this after ensuring the inode is
2828 * unpinned so that it is safe to reclaim the stale inode after the
2829 * flush call.
2830 */
2831 if (xfs_iflags_test(ip, XFS_ISTALE)) {
2832 xfs_ifunlock(ip);
2833 return 0;
2834 }
2835
2836 /*
2871 * This may have been unpinned because the filesystem is shutting 2837 * This may have been unpinned because the filesystem is shutting
2872 * down forcibly. If that's the case we must not write this inode 2838 * down forcibly. If that's the case we must not write this inode
2873 * to disk, because the log record didn't make it to disk! 2839 * to disk, because the log record didn't make it to disk!
@@ -2881,60 +2847,10 @@ xfs_iflush(
2881 } 2847 }
2882 2848
2883 /* 2849 /*
2884 * Decide how buffer will be flushed out. This is done before
2885 * the call to xfs_iflush_int because this field is zeroed by it.
2886 */
2887 if (iip != NULL && iip->ili_format.ilf_fields != 0) {
2888 /*
2889 * Flush out the inode buffer according to the directions
2890 * of the caller. In the cases where the caller has given
2891 * us a choice choose the non-delwri case. This is because
2892 * the inode is in the AIL and we need to get it out soon.
2893 */
2894 switch (flags) {
2895 case XFS_IFLUSH_SYNC:
2896 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
2897 flags = 0;
2898 break;
2899 case XFS_IFLUSH_ASYNC_NOBLOCK:
2900 case XFS_IFLUSH_ASYNC:
2901 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
2902 flags = INT_ASYNC;
2903 break;
2904 case XFS_IFLUSH_DELWRI:
2905 flags = INT_DELWRI;
2906 break;
2907 default:
2908 ASSERT(0);
2909 flags = 0;
2910 break;
2911 }
2912 } else {
2913 switch (flags) {
2914 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
2915 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
2916 case XFS_IFLUSH_DELWRI:
2917 flags = INT_DELWRI;
2918 break;
2919 case XFS_IFLUSH_ASYNC_NOBLOCK:
2920 case XFS_IFLUSH_ASYNC:
2921 flags = INT_ASYNC;
2922 break;
2923 case XFS_IFLUSH_SYNC:
2924 flags = 0;
2925 break;
2926 default:
2927 ASSERT(0);
2928 flags = 0;
2929 break;
2930 }
2931 }
2932
2933 /*
2934 * Get the buffer containing the on-disk inode. 2850 * Get the buffer containing the on-disk inode.
2935 */ 2851 */
2936 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 2852 error = xfs_itobp(mp, NULL, ip, &dip, &bp,
2937 noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK); 2853 (flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK);
2938 if (error || !bp) { 2854 if (error || !bp) {
2939 xfs_ifunlock(ip); 2855 xfs_ifunlock(ip);
2940 return error; 2856 return error;
@@ -2952,7 +2868,7 @@ xfs_iflush(
2952 * get stuck waiting in the write for too long. 2868 * get stuck waiting in the write for too long.
2953 */ 2869 */
2954 if (XFS_BUF_ISPINNED(bp)) 2870 if (XFS_BUF_ISPINNED(bp))
2955 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 2871 xfs_log_force(mp, 0);
2956 2872
2957 /* 2873 /*
2958 * inode clustering: 2874 * inode clustering:
@@ -2962,13 +2878,10 @@ xfs_iflush(
2962 if (error) 2878 if (error)
2963 goto cluster_corrupt_out; 2879 goto cluster_corrupt_out;
2964 2880
2965 if (flags & INT_DELWRI) { 2881 if (flags & SYNC_WAIT)
2966 xfs_bdwrite(mp, bp);
2967 } else if (flags & INT_ASYNC) {
2968 error = xfs_bawrite(mp, bp);
2969 } else {
2970 error = xfs_bwrite(mp, bp); 2882 error = xfs_bwrite(mp, bp);
2971 } 2883 else
2884 xfs_bdwrite(mp, bp);
2972 return error; 2885 return error;
2973 2886
2974corrupt_out: 2887corrupt_out:
@@ -3003,16 +2916,6 @@ xfs_iflush_int(
3003 iip = ip->i_itemp; 2916 iip = ip->i_itemp;
3004 mp = ip->i_mount; 2917 mp = ip->i_mount;
3005 2918
3006
3007 /*
3008 * If the inode isn't dirty, then just release the inode
3009 * flush lock and do nothing.
3010 */
3011 if (xfs_inode_clean(ip)) {
3012 xfs_ifunlock(ip);
3013 return 0;
3014 }
3015
3016 /* set *dip = inode's place in the buffer */ 2919 /* set *dip = inode's place in the buffer */
3017 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); 2920 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
3018 2921
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ec1f28c4fc4f..9965e40a4615 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -420,16 +420,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
420#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) 420#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
421 421
422/* 422/*
423 * Flags for xfs_iflush()
424 */
425#define XFS_IFLUSH_DELWRI_ELSE_SYNC 1
426#define XFS_IFLUSH_DELWRI_ELSE_ASYNC 2
427#define XFS_IFLUSH_SYNC 3
428#define XFS_IFLUSH_ASYNC 4
429#define XFS_IFLUSH_DELWRI 5
430#define XFS_IFLUSH_ASYNC_NOBLOCK 6
431
432/*
433 * Flags for xfs_itruncate_start(). 423 * Flags for xfs_itruncate_start().
434 */ 424 */
435#define XFS_ITRUNC_DEFINITE 0x1 425#define XFS_ITRUNC_DEFINITE 0x1
@@ -481,14 +471,14 @@ int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *,
481int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); 471int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
482 472
483void xfs_iext_realloc(xfs_inode_t *, int, int); 473void xfs_iext_realloc(xfs_inode_t *, int, int);
484void xfs_ipin(xfs_inode_t *); 474void xfs_iunpin_wait(xfs_inode_t *);
485void xfs_iunpin(xfs_inode_t *);
486int xfs_iflush(xfs_inode_t *, uint); 475int xfs_iflush(xfs_inode_t *, uint);
487void xfs_ichgtime(xfs_inode_t *, int); 476void xfs_ichgtime(xfs_inode_t *, int);
488void xfs_lock_inodes(xfs_inode_t **, int, uint); 477void xfs_lock_inodes(xfs_inode_t **, int, uint);
489void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 478void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
490 479
491void xfs_synchronize_times(xfs_inode_t *); 480void xfs_synchronize_times(xfs_inode_t *);
481void xfs_mark_inode_dirty(xfs_inode_t *);
492void xfs_mark_inode_dirty_sync(xfs_inode_t *); 482void xfs_mark_inode_dirty_sync(xfs_inode_t *);
493 483
494#define IHOLD(ip) \ 484#define IHOLD(ip) \
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index f38855d21ea5..7bfea8540159 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -228,7 +228,7 @@ xfs_inode_item_format(
228 228
229 vecp->i_addr = (xfs_caddr_t)&iip->ili_format; 229 vecp->i_addr = (xfs_caddr_t)&iip->ili_format;
230 vecp->i_len = sizeof(xfs_inode_log_format_t); 230 vecp->i_len = sizeof(xfs_inode_log_format_t);
231 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IFORMAT); 231 vecp->i_type = XLOG_REG_TYPE_IFORMAT;
232 vecp++; 232 vecp++;
233 nvecs = 1; 233 nvecs = 1;
234 234
@@ -279,7 +279,7 @@ xfs_inode_item_format(
279 279
280 vecp->i_addr = (xfs_caddr_t)&ip->i_d; 280 vecp->i_addr = (xfs_caddr_t)&ip->i_d;
281 vecp->i_len = sizeof(struct xfs_icdinode); 281 vecp->i_len = sizeof(struct xfs_icdinode);
282 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE); 282 vecp->i_type = XLOG_REG_TYPE_ICORE;
283 vecp++; 283 vecp++;
284 nvecs++; 284 nvecs++;
285 iip->ili_format.ilf_fields |= XFS_ILOG_CORE; 285 iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
@@ -336,7 +336,7 @@ xfs_inode_item_format(
336 vecp->i_addr = 336 vecp->i_addr =
337 (char *)(ip->i_df.if_u1.if_extents); 337 (char *)(ip->i_df.if_u1.if_extents);
338 vecp->i_len = ip->i_df.if_bytes; 338 vecp->i_len = ip->i_df.if_bytes;
339 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT); 339 vecp->i_type = XLOG_REG_TYPE_IEXT;
340 } else 340 } else
341#endif 341#endif
342 { 342 {
@@ -355,7 +355,7 @@ xfs_inode_item_format(
355 vecp->i_addr = (xfs_caddr_t)ext_buffer; 355 vecp->i_addr = (xfs_caddr_t)ext_buffer;
356 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, 356 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
357 XFS_DATA_FORK); 357 XFS_DATA_FORK);
358 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT); 358 vecp->i_type = XLOG_REG_TYPE_IEXT;
359 } 359 }
360 ASSERT(vecp->i_len <= ip->i_df.if_bytes); 360 ASSERT(vecp->i_len <= ip->i_df.if_bytes);
361 iip->ili_format.ilf_dsize = vecp->i_len; 361 iip->ili_format.ilf_dsize = vecp->i_len;
@@ -373,7 +373,7 @@ xfs_inode_item_format(
373 ASSERT(ip->i_df.if_broot != NULL); 373 ASSERT(ip->i_df.if_broot != NULL);
374 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot; 374 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot;
375 vecp->i_len = ip->i_df.if_broot_bytes; 375 vecp->i_len = ip->i_df.if_broot_bytes;
376 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IBROOT); 376 vecp->i_type = XLOG_REG_TYPE_IBROOT;
377 vecp++; 377 vecp++;
378 nvecs++; 378 nvecs++;
379 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; 379 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
@@ -399,7 +399,7 @@ xfs_inode_item_format(
399 ASSERT((ip->i_df.if_real_bytes == 0) || 399 ASSERT((ip->i_df.if_real_bytes == 0) ||
400 (ip->i_df.if_real_bytes == data_bytes)); 400 (ip->i_df.if_real_bytes == data_bytes));
401 vecp->i_len = (int)data_bytes; 401 vecp->i_len = (int)data_bytes;
402 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ILOCAL); 402 vecp->i_type = XLOG_REG_TYPE_ILOCAL;
403 vecp++; 403 vecp++;
404 nvecs++; 404 nvecs++;
405 iip->ili_format.ilf_dsize = (unsigned)data_bytes; 405 iip->ili_format.ilf_dsize = (unsigned)data_bytes;
@@ -477,7 +477,7 @@ xfs_inode_item_format(
477 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, 477 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
478 XFS_ATTR_FORK); 478 XFS_ATTR_FORK);
479#endif 479#endif
480 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_EXT); 480 vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
481 iip->ili_format.ilf_asize = vecp->i_len; 481 iip->ili_format.ilf_asize = vecp->i_len;
482 vecp++; 482 vecp++;
483 nvecs++; 483 nvecs++;
@@ -492,7 +492,7 @@ xfs_inode_item_format(
492 ASSERT(ip->i_afp->if_broot != NULL); 492 ASSERT(ip->i_afp->if_broot != NULL);
493 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot; 493 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot;
494 vecp->i_len = ip->i_afp->if_broot_bytes; 494 vecp->i_len = ip->i_afp->if_broot_bytes;
495 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_BROOT); 495 vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT;
496 vecp++; 496 vecp++;
497 nvecs++; 497 nvecs++;
498 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; 498 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
@@ -516,7 +516,7 @@ xfs_inode_item_format(
516 ASSERT((ip->i_afp->if_real_bytes == 0) || 516 ASSERT((ip->i_afp->if_real_bytes == 0) ||
517 (ip->i_afp->if_real_bytes == data_bytes)); 517 (ip->i_afp->if_real_bytes == data_bytes));
518 vecp->i_len = (int)data_bytes; 518 vecp->i_len = (int)data_bytes;
519 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_LOCAL); 519 vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL;
520 vecp++; 520 vecp++;
521 nvecs++; 521 nvecs++;
522 iip->ili_format.ilf_asize = (unsigned)data_bytes; 522 iip->ili_format.ilf_asize = (unsigned)data_bytes;
@@ -535,23 +535,23 @@ xfs_inode_item_format(
535 535
536/* 536/*
537 * This is called to pin the inode associated with the inode log 537 * This is called to pin the inode associated with the inode log
538 * item in memory so it cannot be written out. Do this by calling 538 * item in memory so it cannot be written out.
539 * xfs_ipin() to bump the pin count in the inode while holding the
540 * inode pin lock.
541 */ 539 */
542STATIC void 540STATIC void
543xfs_inode_item_pin( 541xfs_inode_item_pin(
544 xfs_inode_log_item_t *iip) 542 xfs_inode_log_item_t *iip)
545{ 543{
546 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); 544 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
547 xfs_ipin(iip->ili_inode); 545
546 atomic_inc(&iip->ili_inode->i_pincount);
548} 547}
549 548
550 549
551/* 550/*
552 * This is called to unpin the inode associated with the inode log 551 * This is called to unpin the inode associated with the inode log
553 * item which was previously pinned with a call to xfs_inode_item_pin(). 552 * item which was previously pinned with a call to xfs_inode_item_pin().
554 * Just call xfs_iunpin() on the inode to do this. 553 *
554 * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0.
555 */ 555 */
556/* ARGSUSED */ 556/* ARGSUSED */
557STATIC void 557STATIC void
@@ -559,7 +559,11 @@ xfs_inode_item_unpin(
559 xfs_inode_log_item_t *iip, 559 xfs_inode_log_item_t *iip,
560 int stale) 560 int stale)
561{ 561{
562 xfs_iunpin(iip->ili_inode); 562 struct xfs_inode *ip = iip->ili_inode;
563
564 ASSERT(atomic_read(&ip->i_pincount) > 0);
565 if (atomic_dec_and_test(&ip->i_pincount))
566 wake_up(&ip->i_ipin_wait);
563} 567}
564 568
565/* ARGSUSED */ 569/* ARGSUSED */
@@ -568,7 +572,7 @@ xfs_inode_item_unpin_remove(
568 xfs_inode_log_item_t *iip, 572 xfs_inode_log_item_t *iip,
569 xfs_trans_t *tp) 573 xfs_trans_t *tp)
570{ 574{
571 xfs_iunpin(iip->ili_inode); 575 xfs_inode_item_unpin(iip, 0);
572} 576}
573 577
574/* 578/*
@@ -602,33 +606,20 @@ xfs_inode_item_trylock(
602 606
603 if (!xfs_iflock_nowait(ip)) { 607 if (!xfs_iflock_nowait(ip)) {
604 /* 608 /*
605 * If someone else isn't already trying to push the inode 609 * inode has already been flushed to the backing buffer,
606 * buffer, we get to do it. 610 * leave it locked in shared mode, pushbuf routine will
611 * unlock it.
607 */ 612 */
608 if (iip->ili_pushbuf_flag == 0) { 613 return XFS_ITEM_PUSHBUF;
609 iip->ili_pushbuf_flag = 1;
610#ifdef DEBUG
611 iip->ili_push_owner = current_pid();
612#endif
613 /*
614 * Inode is left locked in shared mode.
615 * Pushbuf routine gets to unlock it.
616 */
617 return XFS_ITEM_PUSHBUF;
618 } else {
619 /*
620 * We hold the AIL lock, so we must specify the
621 * NONOTIFY flag so that we won't double trip.
622 */
623 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
624 return XFS_ITEM_FLUSHING;
625 }
626 /* NOTREACHED */
627 } 614 }
628 615
629 /* Stale items should force out the iclog */ 616 /* Stale items should force out the iclog */
630 if (ip->i_flags & XFS_ISTALE) { 617 if (ip->i_flags & XFS_ISTALE) {
631 xfs_ifunlock(ip); 618 xfs_ifunlock(ip);
619 /*
620 * we hold the AIL lock - notify the unlock routine of this
621 * so it doesn't try to get the lock again.
622 */
632 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY); 623 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
633 return XFS_ITEM_PINNED; 624 return XFS_ITEM_PINNED;
634 } 625 }
@@ -746,11 +737,8 @@ xfs_inode_item_committed(
746 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK 737 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
747 * failed to get the inode flush lock but did get the inode locked SHARED. 738 * failed to get the inode flush lock but did get the inode locked SHARED.
748 * Here we're trying to see if the inode buffer is incore, and if so whether it's 739 * Here we're trying to see if the inode buffer is incore, and if so whether it's
749 * marked delayed write. If that's the case, we'll initiate a bawrite on that 740 * marked delayed write. If that's the case, we'll promote it and that will
750 * buffer to expedite the process. 741 * allow the caller to write the buffer by triggering the xfsbufd to run.
751 *
752 * We aren't holding the AIL lock (or the flush lock) when this gets called,
753 * so it is inherently race-y.
754 */ 742 */
755STATIC void 743STATIC void
756xfs_inode_item_pushbuf( 744xfs_inode_item_pushbuf(
@@ -759,82 +747,30 @@ xfs_inode_item_pushbuf(
759 xfs_inode_t *ip; 747 xfs_inode_t *ip;
760 xfs_mount_t *mp; 748 xfs_mount_t *mp;
761 xfs_buf_t *bp; 749 xfs_buf_t *bp;
762 uint dopush;
763 750
764 ip = iip->ili_inode; 751 ip = iip->ili_inode;
765
766 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); 752 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
767 753
768 /* 754 /*
769 * The ili_pushbuf_flag keeps others from
770 * trying to duplicate our effort.
771 */
772 ASSERT(iip->ili_pushbuf_flag != 0);
773 ASSERT(iip->ili_push_owner == current_pid());
774
775 /*
776 * If a flush is not in progress anymore, chances are that the 755 * If a flush is not in progress anymore, chances are that the
777 * inode was taken off the AIL. So, just get out. 756 * inode was taken off the AIL. So, just get out.
778 */ 757 */
779 if (completion_done(&ip->i_flush) || 758 if (completion_done(&ip->i_flush) ||
780 ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) { 759 ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
781 iip->ili_pushbuf_flag = 0;
782 xfs_iunlock(ip, XFS_ILOCK_SHARED); 760 xfs_iunlock(ip, XFS_ILOCK_SHARED);
783 return; 761 return;
784 } 762 }
785 763
786 mp = ip->i_mount; 764 mp = ip->i_mount;
787 bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno, 765 bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
788 iip->ili_format.ilf_len, XFS_INCORE_TRYLOCK); 766 iip->ili_format.ilf_len, XBF_TRYLOCK);
789
790 if (bp != NULL) {
791 if (XFS_BUF_ISDELAYWRITE(bp)) {
792 /*
793 * We were racing with iflush because we don't hold
794 * the AIL lock or the flush lock. However, at this point,
795 * we have the buffer, and we know that it's dirty.
796 * So, it's possible that iflush raced with us, and
797 * this item is already taken off the AIL.
798 * If not, we can flush it async.
799 */
800 dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
801 !completion_done(&ip->i_flush));
802 iip->ili_pushbuf_flag = 0;
803 xfs_iunlock(ip, XFS_ILOCK_SHARED);
804
805 trace_xfs_inode_item_push(bp, _RET_IP_);
806 767
807 if (XFS_BUF_ISPINNED(bp)) {
808 xfs_log_force(mp, (xfs_lsn_t)0,
809 XFS_LOG_FORCE);
810 }
811 if (dopush) {
812 int error;
813 error = xfs_bawrite(mp, bp);
814 if (error)
815 xfs_fs_cmn_err(CE_WARN, mp,
816 "xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p",
817 error, iip, bp);
818 } else {
819 xfs_buf_relse(bp);
820 }
821 } else {
822 iip->ili_pushbuf_flag = 0;
823 xfs_iunlock(ip, XFS_ILOCK_SHARED);
824 xfs_buf_relse(bp);
825 }
826 return;
827 }
828 /*
829 * We have to be careful about resetting pushbuf flag too early (above).
830 * Even though in theory we can do it as soon as we have the buflock,
831 * we don't want others to be doing work needlessly. They'll come to
832 * this function thinking that pushing the buffer is their
833 * responsibility only to find that the buffer is still locked by
834 * another doing the same thing
835 */
836 iip->ili_pushbuf_flag = 0;
837 xfs_iunlock(ip, XFS_ILOCK_SHARED); 768 xfs_iunlock(ip, XFS_ILOCK_SHARED);
769 if (!bp)
770 return;
771 if (XFS_BUF_ISDELAYWRITE(bp))
772 xfs_buf_delwri_promote(bp);
773 xfs_buf_relse(bp);
838 return; 774 return;
839} 775}
840 776
@@ -867,10 +803,14 @@ xfs_inode_item_push(
867 iip->ili_format.ilf_fields != 0); 803 iip->ili_format.ilf_fields != 0);
868 804
869 /* 805 /*
870 * Write out the inode. The completion routine ('iflush_done') will 806 * Push the inode to its backing buffer. This will not remove the
871 * pull it from the AIL, mark it clean, unlock the flush lock. 807 * inode from the AIL - a further push will be required to trigger a
808 * buffer push. However, this allows all the dirty inodes to be pushed
809 to the buffer before it is pushed to disk. The buffer IO completion
810 will pull the inode from the AIL, mark it clean and unlock the flush
811 * lock.
872 */ 812 */
873 (void) xfs_iflush(ip, XFS_IFLUSH_ASYNC); 813 (void) xfs_iflush(ip, 0);
874 xfs_iunlock(ip, XFS_ILOCK_SHARED); 814 xfs_iunlock(ip, XFS_ILOCK_SHARED);
875 815
876 return; 816 return;
@@ -934,7 +874,6 @@ xfs_inode_item_init(
934 /* 874 /*
935 We have zeroed memory. No need ... 875 We have zeroed memory. No need ...
936 iip->ili_extents_buf = NULL; 876 iip->ili_extents_buf = NULL;
937 iip->ili_pushbuf_flag = 0;
938 */ 877 */
939 878
940 iip->ili_format.ilf_type = XFS_LI_INODE; 879 iip->ili_format.ilf_type = XFS_LI_INODE;
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index cc8df1ac7783..9a467958ecdd 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -144,12 +144,6 @@ typedef struct xfs_inode_log_item {
144 data exts */ 144 data exts */
145 struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged 145 struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged
146 attr exts */ 146 attr exts */
147 unsigned int ili_pushbuf_flag; /* one bit used in push_ail */
148
149#ifdef DEBUG
150 uint64_t ili_push_owner; /* one who sets pushbuf_flag
151 above gets to push the buf */
152#endif
153#ifdef XFS_TRANS_DEBUG 147#ifdef XFS_TRANS_DEBUG
154 int ili_root_size; 148 int ili_root_size;
155 char *ili_orig_root; 149 char *ili_orig_root;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 62efab2f3839..b1b801e4a28e 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -106,6 +106,7 @@ xfs_bulkstat_one_iget(
106 buf->bs_dmevmask = dic->di_dmevmask; 106 buf->bs_dmevmask = dic->di_dmevmask;
107 buf->bs_dmstate = dic->di_dmstate; 107 buf->bs_dmstate = dic->di_dmstate;
108 buf->bs_aextents = dic->di_anextents; 108 buf->bs_aextents = dic->di_anextents;
109 buf->bs_forkoff = XFS_IFORK_BOFF(ip);
109 110
110 switch (dic->di_format) { 111 switch (dic->di_format) {
111 case XFS_DINODE_FMT_DEV: 112 case XFS_DINODE_FMT_DEV:
@@ -176,6 +177,7 @@ xfs_bulkstat_one_dinode(
176 buf->bs_dmevmask = be32_to_cpu(dic->di_dmevmask); 177 buf->bs_dmevmask = be32_to_cpu(dic->di_dmevmask);
177 buf->bs_dmstate = be16_to_cpu(dic->di_dmstate); 178 buf->bs_dmstate = be16_to_cpu(dic->di_dmstate);
178 buf->bs_aextents = be16_to_cpu(dic->di_anextents); 179 buf->bs_aextents = be16_to_cpu(dic->di_anextents);
180 buf->bs_forkoff = XFS_DFORK_BOFF(dic);
179 181
180 switch (dic->di_format) { 182 switch (dic->di_format) {
181 case XFS_DINODE_FMT_DEV: 183 case XFS_DINODE_FMT_DEV:
@@ -408,8 +410,10 @@ xfs_bulkstat(
408 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog); 410 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
409 nimask = ~(nicluster - 1); 411 nimask = ~(nicluster - 1);
410 nbcluster = nicluster >> mp->m_sb.sb_inopblog; 412 nbcluster = nicluster >> mp->m_sb.sb_inopblog;
411 irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4, 413 irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
412 KM_SLEEP | KM_MAYFAIL | KM_LARGE); 414 if (!irbuf)
415 return ENOMEM;
416
413 nirbuf = irbsize / sizeof(*irbuf); 417 nirbuf = irbsize / sizeof(*irbuf);
414 418
415 /* 419 /*
@@ -420,9 +424,7 @@ xfs_bulkstat(
420 while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { 424 while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
421 cond_resched(); 425 cond_resched();
422 bp = NULL; 426 bp = NULL;
423 down_read(&mp->m_peraglock);
424 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); 427 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
425 up_read(&mp->m_peraglock);
426 if (error) { 428 if (error) {
427 /* 429 /*
428 * Skip this allocation group and go to the next one. 430 * Skip this allocation group and go to the next one.
@@ -729,7 +731,7 @@ xfs_bulkstat(
729 /* 731 /*
730 * Done, we're either out of filesystem or space to put the data. 732 * Done, we're either out of filesystem or space to put the data.
731 */ 733 */
732 kmem_free(irbuf); 734 kmem_free_large(irbuf);
733 *ubcountp = ubelem; 735 *ubcountp = ubelem;
734 /* 736 /*
735 * Found some inodes, return them now and return the error next time. 737 * Found some inodes, return them now and return the error next time.
@@ -849,9 +851,7 @@ xfs_inumbers(
849 agbp = NULL; 851 agbp = NULL;
850 while (left > 0 && agno < mp->m_sb.sb_agcount) { 852 while (left > 0 && agno < mp->m_sb.sb_agcount) {
851 if (agbp == NULL) { 853 if (agbp == NULL) {
852 down_read(&mp->m_peraglock);
853 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); 854 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
854 up_read(&mp->m_peraglock);
855 if (error) { 855 if (error) {
856 /* 856 /*
857 * If we can't read the AGI of this ag, 857 * If we can't read the AGI of this ag,
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 600b5b06aaeb..2be019136287 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -50,7 +50,6 @@ kmem_zone_t *xfs_log_ticket_zone;
50 (off) += (bytes);} 50 (off) += (bytes);}
51 51
52/* Local miscellaneous function prototypes */ 52/* Local miscellaneous function prototypes */
53STATIC int xlog_bdstrat_cb(struct xfs_buf *);
54STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket, 53STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket,
55 xlog_in_core_t **, xfs_lsn_t *); 54 xlog_in_core_t **, xfs_lsn_t *);
56STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, 55STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
@@ -61,7 +60,7 @@ STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
61STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 60STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
62STATIC void xlog_dealloc_log(xlog_t *log); 61STATIC void xlog_dealloc_log(xlog_t *log);
63STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[], 62STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[],
64 int nentries, xfs_log_ticket_t tic, 63 int nentries, struct xlog_ticket *tic,
65 xfs_lsn_t *start_lsn, 64 xfs_lsn_t *start_lsn,
66 xlog_in_core_t **commit_iclog, 65 xlog_in_core_t **commit_iclog,
67 uint flags); 66 uint flags);
@@ -80,11 +79,6 @@ STATIC int xlog_state_release_iclog(xlog_t *log,
80STATIC void xlog_state_switch_iclogs(xlog_t *log, 79STATIC void xlog_state_switch_iclogs(xlog_t *log,
81 xlog_in_core_t *iclog, 80 xlog_in_core_t *iclog,
82 int eventual_size); 81 int eventual_size);
83STATIC int xlog_state_sync(xlog_t *log,
84 xfs_lsn_t lsn,
85 uint flags,
86 int *log_flushed);
87STATIC int xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed);
88STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); 82STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
89 83
90/* local functions to manipulate grant head */ 84/* local functions to manipulate grant head */
@@ -249,14 +243,14 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
249 * out when the next write occurs. 243 * out when the next write occurs.
250 */ 244 */
251xfs_lsn_t 245xfs_lsn_t
252xfs_log_done(xfs_mount_t *mp, 246xfs_log_done(
253 xfs_log_ticket_t xtic, 247 struct xfs_mount *mp,
254 void **iclog, 248 struct xlog_ticket *ticket,
255 uint flags) 249 struct xlog_in_core **iclog,
250 uint flags)
256{ 251{
257 xlog_t *log = mp->m_log; 252 struct log *log = mp->m_log;
258 xlog_ticket_t *ticket = (xfs_log_ticket_t) xtic; 253 xfs_lsn_t lsn = 0;
259 xfs_lsn_t lsn = 0;
260 254
261 if (XLOG_FORCED_SHUTDOWN(log) || 255 if (XLOG_FORCED_SHUTDOWN(log) ||
262 /* 256 /*
@@ -264,8 +258,7 @@ xfs_log_done(xfs_mount_t *mp,
264 * If we get an error, just continue and give back the log ticket. 258 * If we get an error, just continue and give back the log ticket.
265 */ 259 */
266 (((ticket->t_flags & XLOG_TIC_INITED) == 0) && 260 (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
267 (xlog_commit_record(mp, ticket, 261 (xlog_commit_record(mp, ticket, iclog, &lsn)))) {
268 (xlog_in_core_t **)iclog, &lsn)))) {
269 lsn = (xfs_lsn_t) -1; 262 lsn = (xfs_lsn_t) -1;
270 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { 263 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
271 flags |= XFS_LOG_REL_PERM_RESERV; 264 flags |= XFS_LOG_REL_PERM_RESERV;
@@ -295,67 +288,8 @@ xfs_log_done(xfs_mount_t *mp,
295 } 288 }
296 289
297 return lsn; 290 return lsn;
298} /* xfs_log_done */
299
300
301/*
302 * Force the in-core log to disk. If flags == XFS_LOG_SYNC,
303 * the force is done synchronously.
304 *
305 * Asynchronous forces are implemented by setting the WANT_SYNC
306 * bit in the appropriate in-core log and then returning.
307 *
308 * Synchronous forces are implemented with a signal variable. All callers
309 * to force a given lsn to disk will wait on a the sv attached to the
310 * specific in-core log. When given in-core log finally completes its
311 * write to disk, that thread will wake up all threads waiting on the
312 * sv.
313 */
314int
315_xfs_log_force(
316 xfs_mount_t *mp,
317 xfs_lsn_t lsn,
318 uint flags,
319 int *log_flushed)
320{
321 xlog_t *log = mp->m_log;
322 int dummy;
323
324 if (!log_flushed)
325 log_flushed = &dummy;
326
327 ASSERT(flags & XFS_LOG_FORCE);
328
329 XFS_STATS_INC(xs_log_force);
330
331 if (log->l_flags & XLOG_IO_ERROR)
332 return XFS_ERROR(EIO);
333 if (lsn == 0)
334 return xlog_state_sync_all(log, flags, log_flushed);
335 else
336 return xlog_state_sync(log, lsn, flags, log_flushed);
337} /* _xfs_log_force */
338
339/*
340 * Wrapper for _xfs_log_force(), to be used when caller doesn't care
341 * about errors or whether the log was flushed or not. This is the normal
342 * interface to use when trying to unpin items or move the log forward.
343 */
344void
345xfs_log_force(
346 xfs_mount_t *mp,
347 xfs_lsn_t lsn,
348 uint flags)
349{
350 int error;
351 error = _xfs_log_force(mp, lsn, flags, NULL);
352 if (error) {
353 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
354 "error %d returned.", error);
355 }
356} 291}
357 292
358
359/* 293/*
360 * Attaches a new iclog I/O completion callback routine during 294 * Attaches a new iclog I/O completion callback routine during
361 * transaction commit. If the log is in error state, a non-zero 295 * transaction commit. If the log is in error state, a non-zero
@@ -363,11 +297,11 @@ xfs_log_force(
363 * executing the callback at an appropriate time. 297 * executing the callback at an appropriate time.
364 */ 298 */
365int 299int
366xfs_log_notify(xfs_mount_t *mp, /* mount of partition */ 300xfs_log_notify(
367 void *iclog_hndl, /* iclog to hang callback off */ 301 struct xfs_mount *mp,
368 xfs_log_callback_t *cb) 302 struct xlog_in_core *iclog,
303 xfs_log_callback_t *cb)
369{ 304{
370 xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl;
371 int abortflg; 305 int abortflg;
372 306
373 spin_lock(&iclog->ic_callback_lock); 307 spin_lock(&iclog->ic_callback_lock);
@@ -381,16 +315,14 @@ xfs_log_notify(xfs_mount_t *mp, /* mount of partition */
381 } 315 }
382 spin_unlock(&iclog->ic_callback_lock); 316 spin_unlock(&iclog->ic_callback_lock);
383 return abortflg; 317 return abortflg;
384} /* xfs_log_notify */ 318}
385 319
386int 320int
387xfs_log_release_iclog(xfs_mount_t *mp, 321xfs_log_release_iclog(
388 void *iclog_hndl) 322 struct xfs_mount *mp,
323 struct xlog_in_core *iclog)
389{ 324{
390 xlog_t *log = mp->m_log; 325 if (xlog_state_release_iclog(mp->m_log, iclog)) {
391 xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl;
392
393 if (xlog_state_release_iclog(log, iclog)) {
394 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 326 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
395 return EIO; 327 return EIO;
396 } 328 }
@@ -409,17 +341,18 @@ xfs_log_release_iclog(xfs_mount_t *mp,
409 * reservation, we prevent over allocation problems. 341 * reservation, we prevent over allocation problems.
410 */ 342 */
411int 343int
412xfs_log_reserve(xfs_mount_t *mp, 344xfs_log_reserve(
413 int unit_bytes, 345 struct xfs_mount *mp,
414 int cnt, 346 int unit_bytes,
415 xfs_log_ticket_t *ticket, 347 int cnt,
416 __uint8_t client, 348 struct xlog_ticket **ticket,
417 uint flags, 349 __uint8_t client,
418 uint t_type) 350 uint flags,
351 uint t_type)
419{ 352{
420 xlog_t *log = mp->m_log; 353 struct log *log = mp->m_log;
421 xlog_ticket_t *internal_ticket; 354 struct xlog_ticket *internal_ticket;
422 int retval = 0; 355 int retval = 0;
423 356
424 ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); 357 ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
425 ASSERT((flags & XFS_LOG_NOSLEEP) == 0); 358 ASSERT((flags & XFS_LOG_NOSLEEP) == 0);
@@ -432,7 +365,7 @@ xfs_log_reserve(xfs_mount_t *mp,
432 365
433 if (*ticket != NULL) { 366 if (*ticket != NULL) {
434 ASSERT(flags & XFS_LOG_PERM_RESERV); 367 ASSERT(flags & XFS_LOG_PERM_RESERV);
435 internal_ticket = (xlog_ticket_t *)*ticket; 368 internal_ticket = *ticket;
436 369
437 trace_xfs_log_reserve(log, internal_ticket); 370 trace_xfs_log_reserve(log, internal_ticket);
438 371
@@ -584,7 +517,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
584 xlog_in_core_t *first_iclog; 517 xlog_in_core_t *first_iclog;
585#endif 518#endif
586 xfs_log_iovec_t reg[1]; 519 xfs_log_iovec_t reg[1];
587 xfs_log_ticket_t tic = NULL; 520 xlog_ticket_t *tic = NULL;
588 xfs_lsn_t lsn; 521 xfs_lsn_t lsn;
589 int error; 522 int error;
590 523
@@ -602,7 +535,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
602 if (mp->m_flags & XFS_MOUNT_RDONLY) 535 if (mp->m_flags & XFS_MOUNT_RDONLY)
603 return 0; 536 return 0;
604 537
605 error = _xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC, NULL); 538 error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
606 ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); 539 ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
607 540
608#ifdef DEBUG 541#ifdef DEBUG
@@ -618,7 +551,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
618 if (! (XLOG_FORCED_SHUTDOWN(log))) { 551 if (! (XLOG_FORCED_SHUTDOWN(log))) {
619 reg[0].i_addr = (void*)&magic; 552 reg[0].i_addr = (void*)&magic;
620 reg[0].i_len = sizeof(magic); 553 reg[0].i_len = sizeof(magic);
621 XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_UNMOUNT); 554 reg[0].i_type = XLOG_REG_TYPE_UNMOUNT;
622 555
623 error = xfs_log_reserve(mp, 600, 1, &tic, 556 error = xfs_log_reserve(mp, 600, 1, &tic,
624 XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); 557 XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
@@ -721,24 +654,24 @@ xfs_log_unmount(xfs_mount_t *mp)
721 * transaction occur with one call to xfs_log_write(). 654 * transaction occur with one call to xfs_log_write().
722 */ 655 */
723int 656int
724xfs_log_write(xfs_mount_t * mp, 657xfs_log_write(
725 xfs_log_iovec_t reg[], 658 struct xfs_mount *mp,
726 int nentries, 659 struct xfs_log_iovec reg[],
727 xfs_log_ticket_t tic, 660 int nentries,
728 xfs_lsn_t *start_lsn) 661 struct xlog_ticket *tic,
662 xfs_lsn_t *start_lsn)
729{ 663{
730 int error; 664 struct log *log = mp->m_log;
731 xlog_t *log = mp->m_log; 665 int error;
732 666
733 if (XLOG_FORCED_SHUTDOWN(log)) 667 if (XLOG_FORCED_SHUTDOWN(log))
734 return XFS_ERROR(EIO); 668 return XFS_ERROR(EIO);
735 669
736 if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0))) { 670 error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0);
671 if (error)
737 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 672 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
738 }
739 return error; 673 return error;
740} /* xfs_log_write */ 674}
741
742 675
743void 676void
744xfs_log_move_tail(xfs_mount_t *mp, 677xfs_log_move_tail(xfs_mount_t *mp,
@@ -812,9 +745,16 @@ xfs_log_move_tail(xfs_mount_t *mp,
812 745
813/* 746/*
814 * Determine if we have a transaction that has gone to disk 747 * Determine if we have a transaction that has gone to disk
815 * that needs to be covered. Log activity needs to be idle (no AIL and 748 * that needs to be covered. To begin the transition to the idle state
816 * nothing in the iclogs). And, we need to be in the right state indicating 749 * firstly the log needs to be idle (no AIL and nothing in the iclogs).
817 * something has gone out. 750 * If we are then in a state where covering is needed, the caller is informed
751 * that dummy transactions are required to move the log into the idle state.
752 *
753 * Because this is called as part of the sync process, we should also indicate
754 * that dummy transactions should be issued in anything but the covered or
755 * idle states. This ensures that the log tail is accurately reflected in
756 * the log at the end of the sync, hence if a crash occurrs avoids replay
757 * of transactions where the metadata is already on disk.
818 */ 758 */
819int 759int
820xfs_log_need_covered(xfs_mount_t *mp) 760xfs_log_need_covered(xfs_mount_t *mp)
@@ -826,17 +766,24 @@ xfs_log_need_covered(xfs_mount_t *mp)
826 return 0; 766 return 0;
827 767
828 spin_lock(&log->l_icloglock); 768 spin_lock(&log->l_icloglock);
829 if (((log->l_covered_state == XLOG_STATE_COVER_NEED) || 769 switch (log->l_covered_state) {
830 (log->l_covered_state == XLOG_STATE_COVER_NEED2)) 770 case XLOG_STATE_COVER_DONE:
831 && !xfs_trans_ail_tail(log->l_ailp) 771 case XLOG_STATE_COVER_DONE2:
832 && xlog_iclogs_empty(log)) { 772 case XLOG_STATE_COVER_IDLE:
833 if (log->l_covered_state == XLOG_STATE_COVER_NEED) 773 break;
834 log->l_covered_state = XLOG_STATE_COVER_DONE; 774 case XLOG_STATE_COVER_NEED:
835 else { 775 case XLOG_STATE_COVER_NEED2:
836 ASSERT(log->l_covered_state == XLOG_STATE_COVER_NEED2); 776 if (!xfs_trans_ail_tail(log->l_ailp) &&
837 log->l_covered_state = XLOG_STATE_COVER_DONE2; 777 xlog_iclogs_empty(log)) {
778 if (log->l_covered_state == XLOG_STATE_COVER_NEED)
779 log->l_covered_state = XLOG_STATE_COVER_DONE;
780 else
781 log->l_covered_state = XLOG_STATE_COVER_DONE2;
838 } 782 }
783 /* FALLTHRU */
784 default:
839 needed = 1; 785 needed = 1;
786 break;
840 } 787 }
841 spin_unlock(&log->l_icloglock); 788 spin_unlock(&log->l_icloglock);
842 return needed; 789 return needed;
@@ -988,35 +935,6 @@ xlog_iodone(xfs_buf_t *bp)
988} /* xlog_iodone */ 935} /* xlog_iodone */
989 936
990/* 937/*
991 * The bdstrat callback function for log bufs. This gives us a central
992 * place to trap bufs in case we get hit by a log I/O error and need to
993 * shutdown. Actually, in practice, even when we didn't get a log error,
994 * we transition the iclogs to IOERROR state *after* flushing all existing
995 * iclogs to disk. This is because we don't want anymore new transactions to be
996 * started or completed afterwards.
997 */
998STATIC int
999xlog_bdstrat_cb(struct xfs_buf *bp)
1000{
1001 xlog_in_core_t *iclog;
1002
1003 iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
1004
1005 if ((iclog->ic_state & XLOG_STATE_IOERROR) == 0) {
1006 /* note for irix bstrat will need struct bdevsw passed
1007 * Fix the following macro if the code ever is merged
1008 */
1009 XFS_bdstrat(bp);
1010 return 0;
1011 }
1012
1013 XFS_BUF_ERROR(bp, EIO);
1014 XFS_BUF_STALE(bp);
1015 xfs_biodone(bp);
1016 return XFS_ERROR(EIO);
1017}
1018
1019/*
1020 * Return size of each in-core log record buffer. 938 * Return size of each in-core log record buffer.
1021 * 939 *
1022 * All machines get 8 x 32kB buffers by default, unless tuned otherwise. 940 * All machines get 8 x 32kB buffers by default, unless tuned otherwise.
@@ -1158,7 +1076,6 @@ xlog_alloc_log(xfs_mount_t *mp,
1158 if (!bp) 1076 if (!bp)
1159 goto out_free_log; 1077 goto out_free_log;
1160 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); 1078 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
1161 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
1162 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); 1079 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1163 ASSERT(XFS_BUF_ISBUSY(bp)); 1080 ASSERT(XFS_BUF_ISBUSY(bp));
1164 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 1081 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
@@ -1196,7 +1113,6 @@ xlog_alloc_log(xfs_mount_t *mp,
1196 if (!XFS_BUF_CPSEMA(bp)) 1113 if (!XFS_BUF_CPSEMA(bp))
1197 ASSERT(0); 1114 ASSERT(0);
1198 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); 1115 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
1199 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
1200 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); 1116 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1201 iclog->ic_bp = bp; 1117 iclog->ic_bp = bp;
1202 iclog->ic_data = bp->b_addr; 1118 iclog->ic_data = bp->b_addr;
@@ -1268,7 +1184,7 @@ xlog_commit_record(xfs_mount_t *mp,
1268 1184
1269 reg[0].i_addr = NULL; 1185 reg[0].i_addr = NULL;
1270 reg[0].i_len = 0; 1186 reg[0].i_len = 0;
1271 XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_COMMIT); 1187 reg[0].i_type = XLOG_REG_TYPE_COMMIT;
1272 1188
1273 ASSERT_ALWAYS(iclog); 1189 ASSERT_ALWAYS(iclog);
1274 if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp, 1190 if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp,
@@ -1343,6 +1259,37 @@ xlog_grant_push_ail(xfs_mount_t *mp,
1343 xfs_trans_ail_push(log->l_ailp, threshold_lsn); 1259 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1344} /* xlog_grant_push_ail */ 1260} /* xlog_grant_push_ail */
1345 1261
1262/*
1263 * The bdstrat callback function for log bufs. This gives us a central
1264 * place to trap bufs in case we get hit by a log I/O error and need to
1265 * shutdown. Actually, in practice, even when we didn't get a log error,
1266 * we transition the iclogs to IOERROR state *after* flushing all existing
1267 * iclogs to disk. This is because we don't want anymore new transactions to be
1268 * started or completed afterwards.
1269 */
1270STATIC int
1271xlog_bdstrat(
1272 struct xfs_buf *bp)
1273{
1274 struct xlog_in_core *iclog;
1275
1276 iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
1277 if (iclog->ic_state & XLOG_STATE_IOERROR) {
1278 XFS_BUF_ERROR(bp, EIO);
1279 XFS_BUF_STALE(bp);
1280 xfs_biodone(bp);
1281 /*
1282 * It would seem logical to return EIO here, but we rely on
1283 * the log state machine to propagate I/O errors instead of
1284 * doing it here.
1285 */
1286 return 0;
1287 }
1288
1289 bp->b_flags |= _XBF_RUN_QUEUES;
1290 xfs_buf_iorequest(bp);
1291 return 0;
1292}
1346 1293
1347/* 1294/*
1348 * Flush out the in-core log (iclog) to the on-disk log in an asynchronous 1295 * Flush out the in-core log (iclog) to the on-disk log in an asynchronous
@@ -1462,7 +1409,7 @@ xlog_sync(xlog_t *log,
1462 */ 1409 */
1463 XFS_BUF_WRITE(bp); 1410 XFS_BUF_WRITE(bp);
1464 1411
1465 if ((error = XFS_bwrite(bp))) { 1412 if ((error = xlog_bdstrat(bp))) {
1466 xfs_ioerror_alert("xlog_sync", log->l_mp, bp, 1413 xfs_ioerror_alert("xlog_sync", log->l_mp, bp,
1467 XFS_BUF_ADDR(bp)); 1414 XFS_BUF_ADDR(bp));
1468 return error; 1415 return error;
@@ -1502,7 +1449,7 @@ xlog_sync(xlog_t *log,
1502 /* account for internal log which doesn't start at block #0 */ 1449 /* account for internal log which doesn't start at block #0 */
1503 XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); 1450 XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
1504 XFS_BUF_WRITE(bp); 1451 XFS_BUF_WRITE(bp);
1505 if ((error = XFS_bwrite(bp))) { 1452 if ((error = xlog_bdstrat(bp))) {
1506 xfs_ioerror_alert("xlog_sync (split)", log->l_mp, 1453 xfs_ioerror_alert("xlog_sync (split)", log->l_mp,
1507 bp, XFS_BUF_ADDR(bp)); 1454 bp, XFS_BUF_ADDR(bp));
1508 return error; 1455 return error;
@@ -1707,16 +1654,16 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
1707 * bytes have been written out. 1654 * bytes have been written out.
1708 */ 1655 */
1709STATIC int 1656STATIC int
1710xlog_write(xfs_mount_t * mp, 1657xlog_write(
1711 xfs_log_iovec_t reg[], 1658 struct xfs_mount *mp,
1712 int nentries, 1659 struct xfs_log_iovec reg[],
1713 xfs_log_ticket_t tic, 1660 int nentries,
1714 xfs_lsn_t *start_lsn, 1661 struct xlog_ticket *ticket,
1715 xlog_in_core_t **commit_iclog, 1662 xfs_lsn_t *start_lsn,
1716 uint flags) 1663 struct xlog_in_core **commit_iclog,
1664 uint flags)
1717{ 1665{
1718 xlog_t *log = mp->m_log; 1666 xlog_t *log = mp->m_log;
1719 xlog_ticket_t *ticket = (xlog_ticket_t *)tic;
1720 xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */ 1667 xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */
1721 xlog_op_header_t *logop_head; /* ptr to log operation header */ 1668 xlog_op_header_t *logop_head; /* ptr to log operation header */
1722 __psint_t ptr; /* copy address into data region */ 1669 __psint_t ptr; /* copy address into data region */
@@ -1830,7 +1777,7 @@ xlog_write(xfs_mount_t * mp,
1830 default: 1777 default:
1831 xfs_fs_cmn_err(CE_WARN, mp, 1778 xfs_fs_cmn_err(CE_WARN, mp,
1832 "Bad XFS transaction clientid 0x%x in ticket 0x%p", 1779 "Bad XFS transaction clientid 0x%x in ticket 0x%p",
1833 logop_head->oh_clientid, tic); 1780 logop_head->oh_clientid, ticket);
1834 return XFS_ERROR(EIO); 1781 return XFS_ERROR(EIO);
1835 } 1782 }
1836 1783
@@ -2854,7 +2801,6 @@ xlog_state_switch_iclogs(xlog_t *log,
2854 log->l_iclog = iclog->ic_next; 2801 log->l_iclog = iclog->ic_next;
2855} /* xlog_state_switch_iclogs */ 2802} /* xlog_state_switch_iclogs */
2856 2803
2857
2858/* 2804/*
2859 * Write out all data in the in-core log as of this exact moment in time. 2805 * Write out all data in the in-core log as of this exact moment in time.
2860 * 2806 *
@@ -2882,11 +2828,17 @@ xlog_state_switch_iclogs(xlog_t *log,
2882 * b) when we return from flushing out this iclog, it is still 2828 * b) when we return from flushing out this iclog, it is still
2883 * not in the active nor dirty state. 2829 * not in the active nor dirty state.
2884 */ 2830 */
2885STATIC int 2831int
2886xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) 2832_xfs_log_force(
2833 struct xfs_mount *mp,
2834 uint flags,
2835 int *log_flushed)
2887{ 2836{
2888 xlog_in_core_t *iclog; 2837 struct log *log = mp->m_log;
2889 xfs_lsn_t lsn; 2838 struct xlog_in_core *iclog;
2839 xfs_lsn_t lsn;
2840
2841 XFS_STATS_INC(xs_log_force);
2890 2842
2891 spin_lock(&log->l_icloglock); 2843 spin_lock(&log->l_icloglock);
2892 2844
@@ -2932,7 +2884,9 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed)
2932 2884
2933 if (xlog_state_release_iclog(log, iclog)) 2885 if (xlog_state_release_iclog(log, iclog))
2934 return XFS_ERROR(EIO); 2886 return XFS_ERROR(EIO);
2935 *log_flushed = 1; 2887
2888 if (log_flushed)
2889 *log_flushed = 1;
2936 spin_lock(&log->l_icloglock); 2890 spin_lock(&log->l_icloglock);
2937 if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn && 2891 if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn &&
2938 iclog->ic_state != XLOG_STATE_DIRTY) 2892 iclog->ic_state != XLOG_STATE_DIRTY)
@@ -2976,19 +2930,37 @@ maybe_sleep:
2976 */ 2930 */
2977 if (iclog->ic_state & XLOG_STATE_IOERROR) 2931 if (iclog->ic_state & XLOG_STATE_IOERROR)
2978 return XFS_ERROR(EIO); 2932 return XFS_ERROR(EIO);
2979 *log_flushed = 1; 2933 if (log_flushed)
2980 2934 *log_flushed = 1;
2981 } else { 2935 } else {
2982 2936
2983no_sleep: 2937no_sleep:
2984 spin_unlock(&log->l_icloglock); 2938 spin_unlock(&log->l_icloglock);
2985 } 2939 }
2986 return 0; 2940 return 0;
2987} /* xlog_state_sync_all */ 2941}
2942
2943/*
2944 * Wrapper for _xfs_log_force(), to be used when caller doesn't care
2945 * about errors or whether the log was flushed or not. This is the normal
2946 * interface to use when trying to unpin items or move the log forward.
2947 */
2948void
2949xfs_log_force(
2950 xfs_mount_t *mp,
2951 uint flags)
2952{
2953 int error;
2988 2954
2955 error = _xfs_log_force(mp, flags, NULL);
2956 if (error) {
2957 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
2958 "error %d returned.", error);
2959 }
2960}
2989 2961
2990/* 2962/*
2991 * Used by code which implements synchronous log forces. 2963 * Force the in-core log to disk for a specific LSN.
2992 * 2964 *
2993 * Find in-core log with lsn. 2965 * Find in-core log with lsn.
2994 * If it is in the DIRTY state, just return. 2966 * If it is in the DIRTY state, just return.
@@ -2996,109 +2968,142 @@ no_sleep:
2996 * state and go to sleep or return. 2968 * state and go to sleep or return.
2997 * If it is in any other state, go to sleep or return. 2969 * If it is in any other state, go to sleep or return.
2998 * 2970 *
2999 * If filesystem activity goes to zero, the iclog will get flushed only by 2971 * Synchronous forces are implemented with a signal variable. All callers
3000 * bdflush(). 2972 * to force a given lsn to disk will wait on a the sv attached to the
2973 * specific in-core log. When given in-core log finally completes its
2974 * write to disk, that thread will wake up all threads waiting on the
2975 * sv.
3001 */ 2976 */
3002STATIC int 2977int
3003xlog_state_sync(xlog_t *log, 2978_xfs_log_force_lsn(
3004 xfs_lsn_t lsn, 2979 struct xfs_mount *mp,
3005 uint flags, 2980 xfs_lsn_t lsn,
3006 int *log_flushed) 2981 uint flags,
2982 int *log_flushed)
3007{ 2983{
3008 xlog_in_core_t *iclog; 2984 struct log *log = mp->m_log;
3009 int already_slept = 0; 2985 struct xlog_in_core *iclog;
2986 int already_slept = 0;
3010 2987
3011try_again: 2988 ASSERT(lsn != 0);
3012 spin_lock(&log->l_icloglock);
3013 iclog = log->l_iclog;
3014 2989
3015 if (iclog->ic_state & XLOG_STATE_IOERROR) { 2990 XFS_STATS_INC(xs_log_force);
3016 spin_unlock(&log->l_icloglock);
3017 return XFS_ERROR(EIO);
3018 }
3019
3020 do {
3021 if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
3022 iclog = iclog->ic_next;
3023 continue;
3024 }
3025 2991
3026 if (iclog->ic_state == XLOG_STATE_DIRTY) { 2992try_again:
2993 spin_lock(&log->l_icloglock);
2994 iclog = log->l_iclog;
2995 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3027 spin_unlock(&log->l_icloglock); 2996 spin_unlock(&log->l_icloglock);
3028 return 0; 2997 return XFS_ERROR(EIO);
3029 } 2998 }
3030 2999
3031 if (iclog->ic_state == XLOG_STATE_ACTIVE) { 3000 do {
3032 /* 3001 if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
3033 * We sleep here if we haven't already slept (e.g. 3002 iclog = iclog->ic_next;
3034 * this is the first time we've looked at the correct 3003 continue;
3035 * iclog buf) and the buffer before us is going to 3004 }
3036 * be sync'ed. The reason for this is that if we 3005
3037 * are doing sync transactions here, by waiting for 3006 if (iclog->ic_state == XLOG_STATE_DIRTY) {
3038 * the previous I/O to complete, we can allow a few 3007 spin_unlock(&log->l_icloglock);
3039 * more transactions into this iclog before we close 3008 return 0;
3040 * it down. 3009 }
3041 * 3010
3042 * Otherwise, we mark the buffer WANT_SYNC, and bump 3011 if (iclog->ic_state == XLOG_STATE_ACTIVE) {
3043 * up the refcnt so we can release the log (which drops 3012 /*
3044 * the ref count). The state switch keeps new transaction 3013 * We sleep here if we haven't already slept (e.g.
3045 * commits from using this buffer. When the current commits 3014 * this is the first time we've looked at the correct
3046 * finish writing into the buffer, the refcount will drop to 3015 * iclog buf) and the buffer before us is going to
3047 * zero and the buffer will go out then. 3016 * be sync'ed. The reason for this is that if we
3048 */ 3017 * are doing sync transactions here, by waiting for
3049 if (!already_slept && 3018 * the previous I/O to complete, we can allow a few
3050 (iclog->ic_prev->ic_state & (XLOG_STATE_WANT_SYNC | 3019 * more transactions into this iclog before we close
3051 XLOG_STATE_SYNCING))) { 3020 * it down.
3052 ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); 3021 *
3053 XFS_STATS_INC(xs_log_force_sleep); 3022 * Otherwise, we mark the buffer WANT_SYNC, and bump
3054 sv_wait(&iclog->ic_prev->ic_write_wait, PSWP, 3023 * up the refcnt so we can release the log (which
3055 &log->l_icloglock, s); 3024 * drops the ref count). The state switch keeps new
3056 *log_flushed = 1; 3025 * transaction commits from using this buffer. When
3057 already_slept = 1; 3026 * the current commits finish writing into the buffer,
3058 goto try_again; 3027 * the refcount will drop to zero and the buffer will
3059 } else { 3028 * go out then.
3029 */
3030 if (!already_slept &&
3031 (iclog->ic_prev->ic_state &
3032 (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) {
3033 ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
3034
3035 XFS_STATS_INC(xs_log_force_sleep);
3036
3037 sv_wait(&iclog->ic_prev->ic_write_wait,
3038 PSWP, &log->l_icloglock, s);
3039 if (log_flushed)
3040 *log_flushed = 1;
3041 already_slept = 1;
3042 goto try_again;
3043 }
3060 atomic_inc(&iclog->ic_refcnt); 3044 atomic_inc(&iclog->ic_refcnt);
3061 xlog_state_switch_iclogs(log, iclog, 0); 3045 xlog_state_switch_iclogs(log, iclog, 0);
3062 spin_unlock(&log->l_icloglock); 3046 spin_unlock(&log->l_icloglock);
3063 if (xlog_state_release_iclog(log, iclog)) 3047 if (xlog_state_release_iclog(log, iclog))
3064 return XFS_ERROR(EIO); 3048 return XFS_ERROR(EIO);
3065 *log_flushed = 1; 3049 if (log_flushed)
3050 *log_flushed = 1;
3066 spin_lock(&log->l_icloglock); 3051 spin_lock(&log->l_icloglock);
3067 } 3052 }
3068 }
3069 3053
3070 if ((flags & XFS_LOG_SYNC) && /* sleep */ 3054 if ((flags & XFS_LOG_SYNC) && /* sleep */
3071 !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { 3055 !(iclog->ic_state &
3056 (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) {
3057 /*
3058 * Don't wait on completion if we know that we've
3059 * gotten a log write error.
3060 */
3061 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3062 spin_unlock(&log->l_icloglock);
3063 return XFS_ERROR(EIO);
3064 }
3065 XFS_STATS_INC(xs_log_force_sleep);
3066 sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
3067 /*
3068 * No need to grab the log lock here since we're
3069 * only deciding whether or not to return EIO
3070 * and the memory read should be atomic.
3071 */
3072 if (iclog->ic_state & XLOG_STATE_IOERROR)
3073 return XFS_ERROR(EIO);
3072 3074
3073 /* 3075 if (log_flushed)
3074 * Don't wait on completion if we know that we've 3076 *log_flushed = 1;
3075 * gotten a log write error. 3077 } else { /* just return */
3076 */
3077 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3078 spin_unlock(&log->l_icloglock); 3078 spin_unlock(&log->l_icloglock);
3079 return XFS_ERROR(EIO);
3080 } 3079 }
3081 XFS_STATS_INC(xs_log_force_sleep);
3082 sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
3083 /*
3084 * No need to grab the log lock here since we're
3085 * only deciding whether or not to return EIO
3086 * and the memory read should be atomic.
3087 */
3088 if (iclog->ic_state & XLOG_STATE_IOERROR)
3089 return XFS_ERROR(EIO);
3090 *log_flushed = 1;
3091 } else { /* just return */
3092 spin_unlock(&log->l_icloglock);
3093 }
3094 return 0;
3095 3080
3096 } while (iclog != log->l_iclog); 3081 return 0;
3082 } while (iclog != log->l_iclog);
3097 3083
3098 spin_unlock(&log->l_icloglock); 3084 spin_unlock(&log->l_icloglock);
3099 return 0; 3085 return 0;
3100} /* xlog_state_sync */ 3086}
3101 3087
3088/*
3089 * Wrapper for _xfs_log_force_lsn(), to be used when caller doesn't care
3090 * about errors or whether the log was flushed or not. This is the normal
3091 * interface to use when trying to unpin items or move the log forward.
3092 */
3093void
3094xfs_log_force_lsn(
3095 xfs_mount_t *mp,
3096 xfs_lsn_t lsn,
3097 uint flags)
3098{
3099 int error;
3100
3101 error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
3102 if (error) {
3103 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
3104 "error %d returned.", error);
3105 }
3106}
3102 3107
3103/* 3108/*
3104 * Called when we want to mark the current iclog as being ready to sync to 3109 * Called when we want to mark the current iclog as being ready to sync to
@@ -3463,7 +3468,6 @@ xfs_log_force_umount(
3463 xlog_ticket_t *tic; 3468 xlog_ticket_t *tic;
3464 xlog_t *log; 3469 xlog_t *log;
3465 int retval; 3470 int retval;
3466 int dummy;
3467 3471
3468 log = mp->m_log; 3472 log = mp->m_log;
3469 3473
@@ -3537,13 +3541,14 @@ xfs_log_force_umount(
3537 } 3541 }
3538 spin_unlock(&log->l_grant_lock); 3542 spin_unlock(&log->l_grant_lock);
3539 3543
3540 if (! (log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { 3544 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
3541 ASSERT(!logerror); 3545 ASSERT(!logerror);
3542 /* 3546 /*
3543 * Force the incore logs to disk before shutting the 3547 * Force the incore logs to disk before shutting the
3544 * log down completely. 3548 * log down completely.
3545 */ 3549 */
3546 xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC, &dummy); 3550 _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
3551
3547 spin_lock(&log->l_icloglock); 3552 spin_lock(&log->l_icloglock);
3548 retval = xlog_state_ioerror(log); 3553 retval = xlog_state_ioerror(log);
3549 spin_unlock(&log->l_icloglock); 3554 spin_unlock(&log->l_icloglock);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index d0c9baa50b1a..97a24c7795a4 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -70,14 +70,8 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
70 * Flags to xfs_log_force() 70 * Flags to xfs_log_force()
71 * 71 *
72 * XFS_LOG_SYNC: Synchronous force in-core log to disk 72 * XFS_LOG_SYNC: Synchronous force in-core log to disk
73 * XFS_LOG_FORCE: Start in-core log write now.
74 * XFS_LOG_URGE: Start write within some window of time.
75 *
76 * Note: Either XFS_LOG_FORCE or XFS_LOG_URGE must be set.
77 */ 73 */
78#define XFS_LOG_SYNC 0x1 74#define XFS_LOG_SYNC 0x1
79#define XFS_LOG_FORCE 0x2
80#define XFS_LOG_URGE 0x4
81 75
82#endif /* __KERNEL__ */ 76#endif /* __KERNEL__ */
83 77
@@ -110,16 +104,12 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
110#define XLOG_REG_TYPE_TRANSHDR 19 104#define XLOG_REG_TYPE_TRANSHDR 19
111#define XLOG_REG_TYPE_MAX 19 105#define XLOG_REG_TYPE_MAX 19
112 106
113#define XLOG_VEC_SET_TYPE(vecp, t) ((vecp)->i_type = (t))
114
115typedef struct xfs_log_iovec { 107typedef struct xfs_log_iovec {
116 xfs_caddr_t i_addr; /* beginning address of region */ 108 xfs_caddr_t i_addr; /* beginning address of region */
117 int i_len; /* length in bytes of region */ 109 int i_len; /* length in bytes of region */
118 uint i_type; /* type of region */ 110 uint i_type; /* type of region */
119} xfs_log_iovec_t; 111} xfs_log_iovec_t;
120 112
121typedef void* xfs_log_ticket_t;
122
123/* 113/*
124 * Structure used to pass callback function and the function's argument 114 * Structure used to pass callback function and the function's argument
125 * to the log manager. 115 * to the log manager.
@@ -134,18 +124,25 @@ typedef struct xfs_log_callback {
134#ifdef __KERNEL__ 124#ifdef __KERNEL__
135/* Log manager interfaces */ 125/* Log manager interfaces */
136struct xfs_mount; 126struct xfs_mount;
127struct xlog_in_core;
137struct xlog_ticket; 128struct xlog_ticket;
129
138xfs_lsn_t xfs_log_done(struct xfs_mount *mp, 130xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
139 xfs_log_ticket_t ticket, 131 struct xlog_ticket *ticket,
140 void **iclog, 132 struct xlog_in_core **iclog,
141 uint flags); 133 uint flags);
142int _xfs_log_force(struct xfs_mount *mp, 134int _xfs_log_force(struct xfs_mount *mp,
143 xfs_lsn_t lsn,
144 uint flags, 135 uint flags,
145 int *log_forced); 136 int *log_forced);
146void xfs_log_force(struct xfs_mount *mp, 137void xfs_log_force(struct xfs_mount *mp,
147 xfs_lsn_t lsn,
148 uint flags); 138 uint flags);
139int _xfs_log_force_lsn(struct xfs_mount *mp,
140 xfs_lsn_t lsn,
141 uint flags,
142 int *log_forced);
143void xfs_log_force_lsn(struct xfs_mount *mp,
144 xfs_lsn_t lsn,
145 uint flags);
149int xfs_log_mount(struct xfs_mount *mp, 146int xfs_log_mount(struct xfs_mount *mp,
150 struct xfs_buftarg *log_target, 147 struct xfs_buftarg *log_target,
151 xfs_daddr_t start_block, 148 xfs_daddr_t start_block,
@@ -154,21 +151,21 @@ int xfs_log_mount_finish(struct xfs_mount *mp);
154void xfs_log_move_tail(struct xfs_mount *mp, 151void xfs_log_move_tail(struct xfs_mount *mp,
155 xfs_lsn_t tail_lsn); 152 xfs_lsn_t tail_lsn);
156int xfs_log_notify(struct xfs_mount *mp, 153int xfs_log_notify(struct xfs_mount *mp,
157 void *iclog, 154 struct xlog_in_core *iclog,
158 xfs_log_callback_t *callback_entry); 155 xfs_log_callback_t *callback_entry);
159int xfs_log_release_iclog(struct xfs_mount *mp, 156int xfs_log_release_iclog(struct xfs_mount *mp,
160 void *iclog_hndl); 157 struct xlog_in_core *iclog);
161int xfs_log_reserve(struct xfs_mount *mp, 158int xfs_log_reserve(struct xfs_mount *mp,
162 int length, 159 int length,
163 int count, 160 int count,
164 xfs_log_ticket_t *ticket, 161 struct xlog_ticket **ticket,
165 __uint8_t clientid, 162 __uint8_t clientid,
166 uint flags, 163 uint flags,
167 uint t_type); 164 uint t_type);
168int xfs_log_write(struct xfs_mount *mp, 165int xfs_log_write(struct xfs_mount *mp,
169 xfs_log_iovec_t region[], 166 xfs_log_iovec_t region[],
170 int nentries, 167 int nentries,
171 xfs_log_ticket_t ticket, 168 struct xlog_ticket *ticket,
172 xfs_lsn_t *start_lsn); 169 xfs_lsn_t *start_lsn);
173int xfs_log_unmount_write(struct xfs_mount *mp); 170int xfs_log_unmount_write(struct xfs_mount *mp);
174void xfs_log_unmount(struct xfs_mount *mp); 171void xfs_log_unmount(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index d55662db7077..fd02a18facd5 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -443,14 +443,9 @@ typedef struct log {
443 443
444/* common routines */ 444/* common routines */
445extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); 445extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
446extern int xlog_find_tail(xlog_t *log,
447 xfs_daddr_t *head_blk,
448 xfs_daddr_t *tail_blk);
449extern int xlog_recover(xlog_t *log); 446extern int xlog_recover(xlog_t *log);
450extern int xlog_recover_finish(xlog_t *log); 447extern int xlog_recover_finish(xlog_t *log);
451extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); 448extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
452extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
453extern void xlog_put_bp(struct xfs_buf *);
454 449
455extern kmem_zone_t *xfs_log_ticket_zone; 450extern kmem_zone_t *xfs_log_ticket_zone;
456 451
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 69ac2e5ef20c..22e6efdc17ea 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -50,8 +50,6 @@
50 50
51STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); 51STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
52STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); 52STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
53STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q,
54 xlog_recover_item_t *item);
55#if defined(DEBUG) 53#if defined(DEBUG)
56STATIC void xlog_recover_check_summary(xlog_t *); 54STATIC void xlog_recover_check_summary(xlog_t *);
57#else 55#else
@@ -68,7 +66,7 @@ STATIC void xlog_recover_check_summary(xlog_t *);
68 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) 66 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) )
69#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) 67#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask)
70 68
71xfs_buf_t * 69STATIC xfs_buf_t *
72xlog_get_bp( 70xlog_get_bp(
73 xlog_t *log, 71 xlog_t *log,
74 int nbblks) 72 int nbblks)
@@ -88,7 +86,7 @@ xlog_get_bp(
88 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); 86 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
89} 87}
90 88
91void 89STATIC void
92xlog_put_bp( 90xlog_put_bp(
93 xfs_buf_t *bp) 91 xfs_buf_t *bp)
94{ 92{
@@ -805,7 +803,7 @@ xlog_find_head(
805 * We could speed up search by using current head_blk buffer, but it is not 803 * We could speed up search by using current head_blk buffer, but it is not
806 * available. 804 * available.
807 */ 805 */
808int 806STATIC int
809xlog_find_tail( 807xlog_find_tail(
810 xlog_t *log, 808 xlog_t *log,
811 xfs_daddr_t *head_blk, 809 xfs_daddr_t *head_blk,
@@ -1367,36 +1365,45 @@ xlog_clear_stale_blocks(
1367 1365
1368STATIC xlog_recover_t * 1366STATIC xlog_recover_t *
1369xlog_recover_find_tid( 1367xlog_recover_find_tid(
1370 xlog_recover_t *q, 1368 struct hlist_head *head,
1371 xlog_tid_t tid) 1369 xlog_tid_t tid)
1372{ 1370{
1373 xlog_recover_t *p = q; 1371 xlog_recover_t *trans;
1372 struct hlist_node *n;
1374 1373
1375 while (p != NULL) { 1374 hlist_for_each_entry(trans, n, head, r_list) {
1376 if (p->r_log_tid == tid) 1375 if (trans->r_log_tid == tid)
1377 break; 1376 return trans;
1378 p = p->r_next;
1379 } 1377 }
1380 return p; 1378 return NULL;
1381} 1379}
1382 1380
1383STATIC void 1381STATIC void
1384xlog_recover_put_hashq( 1382xlog_recover_new_tid(
1385 xlog_recover_t **q, 1383 struct hlist_head *head,
1386 xlog_recover_t *trans) 1384 xlog_tid_t tid,
1385 xfs_lsn_t lsn)
1387{ 1386{
1388 trans->r_next = *q; 1387 xlog_recover_t *trans;
1389 *q = trans; 1388
1389 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1390 trans->r_log_tid = tid;
1391 trans->r_lsn = lsn;
1392 INIT_LIST_HEAD(&trans->r_itemq);
1393
1394 INIT_HLIST_NODE(&trans->r_list);
1395 hlist_add_head(&trans->r_list, head);
1390} 1396}
1391 1397
1392STATIC void 1398STATIC void
1393xlog_recover_add_item( 1399xlog_recover_add_item(
1394 xlog_recover_item_t **itemq) 1400 struct list_head *head)
1395{ 1401{
1396 xlog_recover_item_t *item; 1402 xlog_recover_item_t *item;
1397 1403
1398 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); 1404 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1399 xlog_recover_insert_item_backq(itemq, item); 1405 INIT_LIST_HEAD(&item->ri_list);
1406 list_add_tail(&item->ri_list, head);
1400} 1407}
1401 1408
1402STATIC int 1409STATIC int
@@ -1409,8 +1416,7 @@ xlog_recover_add_to_cont_trans(
1409 xfs_caddr_t ptr, old_ptr; 1416 xfs_caddr_t ptr, old_ptr;
1410 int old_len; 1417 int old_len;
1411 1418
1412 item = trans->r_itemq; 1419 if (list_empty(&trans->r_itemq)) {
1413 if (item == NULL) {
1414 /* finish copying rest of trans header */ 1420 /* finish copying rest of trans header */
1415 xlog_recover_add_item(&trans->r_itemq); 1421 xlog_recover_add_item(&trans->r_itemq);
1416 ptr = (xfs_caddr_t) &trans->r_theader + 1422 ptr = (xfs_caddr_t) &trans->r_theader +
@@ -1418,7 +1424,8 @@ xlog_recover_add_to_cont_trans(
1418 memcpy(ptr, dp, len); /* d, s, l */ 1424 memcpy(ptr, dp, len); /* d, s, l */
1419 return 0; 1425 return 0;
1420 } 1426 }
1421 item = item->ri_prev; 1427 /* take the tail entry */
1428 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1422 1429
1423 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; 1430 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1424 old_len = item->ri_buf[item->ri_cnt-1].i_len; 1431 old_len = item->ri_buf[item->ri_cnt-1].i_len;
@@ -1455,8 +1462,7 @@ xlog_recover_add_to_trans(
1455 1462
1456 if (!len) 1463 if (!len)
1457 return 0; 1464 return 0;
1458 item = trans->r_itemq; 1465 if (list_empty(&trans->r_itemq)) {
1459 if (item == NULL) {
1460 /* we need to catch log corruptions here */ 1466 /* we need to catch log corruptions here */
1461 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { 1467 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1462 xlog_warn("XFS: xlog_recover_add_to_trans: " 1468 xlog_warn("XFS: xlog_recover_add_to_trans: "
@@ -1474,12 +1480,15 @@ xlog_recover_add_to_trans(
1474 memcpy(ptr, dp, len); 1480 memcpy(ptr, dp, len);
1475 in_f = (xfs_inode_log_format_t *)ptr; 1481 in_f = (xfs_inode_log_format_t *)ptr;
1476 1482
1477 if (item->ri_prev->ri_total != 0 && 1483 /* take the tail entry */
1478 item->ri_prev->ri_total == item->ri_prev->ri_cnt) { 1484 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1485 if (item->ri_total != 0 &&
1486 item->ri_total == item->ri_cnt) {
1487 /* tail item is in use, get a new one */
1479 xlog_recover_add_item(&trans->r_itemq); 1488 xlog_recover_add_item(&trans->r_itemq);
1489 item = list_entry(trans->r_itemq.prev,
1490 xlog_recover_item_t, ri_list);
1480 } 1491 }
1481 item = trans->r_itemq;
1482 item = item->ri_prev;
1483 1492
1484 if (item->ri_total == 0) { /* first region to be added */ 1493 if (item->ri_total == 0) { /* first region to be added */
1485 if (in_f->ilf_size == 0 || 1494 if (in_f->ilf_size == 0 ||
@@ -1504,96 +1513,29 @@ xlog_recover_add_to_trans(
1504 return 0; 1513 return 0;
1505} 1514}
1506 1515
1507STATIC void 1516/*
1508xlog_recover_new_tid( 1517 * Sort the log items in the transaction. Cancelled buffers need
1509 xlog_recover_t **q, 1518 * to be put first so they are processed before any items that might
1510 xlog_tid_t tid, 1519 * modify the buffers. If they are cancelled, then the modifications
1511 xfs_lsn_t lsn) 1520 * don't need to be replayed.
1512{ 1521 */
1513 xlog_recover_t *trans;
1514
1515 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1516 trans->r_log_tid = tid;
1517 trans->r_lsn = lsn;
1518 xlog_recover_put_hashq(q, trans);
1519}
1520
1521STATIC int
1522xlog_recover_unlink_tid(
1523 xlog_recover_t **q,
1524 xlog_recover_t *trans)
1525{
1526 xlog_recover_t *tp;
1527 int found = 0;
1528
1529 ASSERT(trans != NULL);
1530 if (trans == *q) {
1531 *q = (*q)->r_next;
1532 } else {
1533 tp = *q;
1534 while (tp) {
1535 if (tp->r_next == trans) {
1536 found = 1;
1537 break;
1538 }
1539 tp = tp->r_next;
1540 }
1541 if (!found) {
1542 xlog_warn(
1543 "XFS: xlog_recover_unlink_tid: trans not found");
1544 ASSERT(0);
1545 return XFS_ERROR(EIO);
1546 }
1547 tp->r_next = tp->r_next->r_next;
1548 }
1549 return 0;
1550}
1551
1552STATIC void
1553xlog_recover_insert_item_backq(
1554 xlog_recover_item_t **q,
1555 xlog_recover_item_t *item)
1556{
1557 if (*q == NULL) {
1558 item->ri_prev = item->ri_next = item;
1559 *q = item;
1560 } else {
1561 item->ri_next = *q;
1562 item->ri_prev = (*q)->ri_prev;
1563 (*q)->ri_prev = item;
1564 item->ri_prev->ri_next = item;
1565 }
1566}
1567
1568STATIC void
1569xlog_recover_insert_item_frontq(
1570 xlog_recover_item_t **q,
1571 xlog_recover_item_t *item)
1572{
1573 xlog_recover_insert_item_backq(q, item);
1574 *q = item;
1575}
1576
1577STATIC int 1522STATIC int
1578xlog_recover_reorder_trans( 1523xlog_recover_reorder_trans(
1579 xlog_recover_t *trans) 1524 xlog_recover_t *trans)
1580{ 1525{
1581 xlog_recover_item_t *first_item, *itemq, *itemq_next; 1526 xlog_recover_item_t *item, *n;
1582 xfs_buf_log_format_t *buf_f; 1527 LIST_HEAD(sort_list);
1583 ushort flags = 0;
1584 1528
1585 first_item = itemq = trans->r_itemq; 1529 list_splice_init(&trans->r_itemq, &sort_list);
1586 trans->r_itemq = NULL; 1530 list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1587 do { 1531 xfs_buf_log_format_t *buf_f;
1588 itemq_next = itemq->ri_next;
1589 buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr;
1590 1532
1591 switch (ITEM_TYPE(itemq)) { 1533 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
1534
1535 switch (ITEM_TYPE(item)) {
1592 case XFS_LI_BUF: 1536 case XFS_LI_BUF:
1593 flags = buf_f->blf_flags; 1537 if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) {
1594 if (!(flags & XFS_BLI_CANCEL)) { 1538 list_move(&item->ri_list, &trans->r_itemq);
1595 xlog_recover_insert_item_frontq(&trans->r_itemq,
1596 itemq);
1597 break; 1539 break;
1598 } 1540 }
1599 case XFS_LI_INODE: 1541 case XFS_LI_INODE:
@@ -1601,7 +1543,7 @@ xlog_recover_reorder_trans(
1601 case XFS_LI_QUOTAOFF: 1543 case XFS_LI_QUOTAOFF:
1602 case XFS_LI_EFD: 1544 case XFS_LI_EFD:
1603 case XFS_LI_EFI: 1545 case XFS_LI_EFI:
1604 xlog_recover_insert_item_backq(&trans->r_itemq, itemq); 1546 list_move_tail(&item->ri_list, &trans->r_itemq);
1605 break; 1547 break;
1606 default: 1548 default:
1607 xlog_warn( 1549 xlog_warn(
@@ -1609,8 +1551,8 @@ xlog_recover_reorder_trans(
1609 ASSERT(0); 1551 ASSERT(0);
1610 return XFS_ERROR(EIO); 1552 return XFS_ERROR(EIO);
1611 } 1553 }
1612 itemq = itemq_next; 1554 }
1613 } while (first_item != itemq); 1555 ASSERT(list_empty(&sort_list));
1614 return 0; 1556 return 0;
1615} 1557}
1616 1558
@@ -2242,9 +2184,9 @@ xlog_recover_do_buffer_trans(
2242 } 2184 }
2243 2185
2244 mp = log->l_mp; 2186 mp = log->l_mp;
2245 buf_flags = XFS_BUF_LOCK; 2187 buf_flags = XBF_LOCK;
2246 if (!(flags & XFS_BLI_INODE_BUF)) 2188 if (!(flags & XFS_BLI_INODE_BUF))
2247 buf_flags |= XFS_BUF_MAPPED; 2189 buf_flags |= XBF_MAPPED;
2248 2190
2249 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); 2191 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
2250 if (XFS_BUF_ISERROR(bp)) { 2192 if (XFS_BUF_ISERROR(bp)) {
@@ -2346,7 +2288,7 @@ xlog_recover_do_inode_trans(
2346 } 2288 }
2347 2289
2348 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 2290 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
2349 XFS_BUF_LOCK); 2291 XBF_LOCK);
2350 if (XFS_BUF_ISERROR(bp)) { 2292 if (XFS_BUF_ISERROR(bp)) {
2351 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, 2293 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
2352 bp, in_f->ilf_blkno); 2294 bp, in_f->ilf_blkno);
@@ -2814,14 +2756,13 @@ xlog_recover_do_trans(
2814 int pass) 2756 int pass)
2815{ 2757{
2816 int error = 0; 2758 int error = 0;
2817 xlog_recover_item_t *item, *first_item; 2759 xlog_recover_item_t *item;
2818 2760
2819 error = xlog_recover_reorder_trans(trans); 2761 error = xlog_recover_reorder_trans(trans);
2820 if (error) 2762 if (error)
2821 return error; 2763 return error;
2822 2764
2823 first_item = item = trans->r_itemq; 2765 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2824 do {
2825 switch (ITEM_TYPE(item)) { 2766 switch (ITEM_TYPE(item)) {
2826 case XFS_LI_BUF: 2767 case XFS_LI_BUF:
2827 error = xlog_recover_do_buffer_trans(log, item, pass); 2768 error = xlog_recover_do_buffer_trans(log, item, pass);
@@ -2854,8 +2795,7 @@ xlog_recover_do_trans(
2854 2795
2855 if (error) 2796 if (error)
2856 return error; 2797 return error;
2857 item = item->ri_next; 2798 }
2858 } while (first_item != item);
2859 2799
2860 return 0; 2800 return 0;
2861} 2801}
@@ -2869,21 +2809,18 @@ STATIC void
2869xlog_recover_free_trans( 2809xlog_recover_free_trans(
2870 xlog_recover_t *trans) 2810 xlog_recover_t *trans)
2871{ 2811{
2872 xlog_recover_item_t *first_item, *item, *free_item; 2812 xlog_recover_item_t *item, *n;
2873 int i; 2813 int i;
2874 2814
2875 item = first_item = trans->r_itemq; 2815 list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
2876 do { 2816 /* Free the regions in the item. */
2877 free_item = item; 2817 list_del(&item->ri_list);
2878 item = item->ri_next; 2818 for (i = 0; i < item->ri_cnt; i++)
2879 /* Free the regions in the item. */ 2819 kmem_free(item->ri_buf[i].i_addr);
2880 for (i = 0; i < free_item->ri_cnt; i++) {
2881 kmem_free(free_item->ri_buf[i].i_addr);
2882 }
2883 /* Free the item itself */ 2820 /* Free the item itself */
2884 kmem_free(free_item->ri_buf); 2821 kmem_free(item->ri_buf);
2885 kmem_free(free_item); 2822 kmem_free(item);
2886 } while (first_item != item); 2823 }
2887 /* Free the transaction recover structure */ 2824 /* Free the transaction recover structure */
2888 kmem_free(trans); 2825 kmem_free(trans);
2889} 2826}
@@ -2891,14 +2828,12 @@ xlog_recover_free_trans(
2891STATIC int 2828STATIC int
2892xlog_recover_commit_trans( 2829xlog_recover_commit_trans(
2893 xlog_t *log, 2830 xlog_t *log,
2894 xlog_recover_t **q,
2895 xlog_recover_t *trans, 2831 xlog_recover_t *trans,
2896 int pass) 2832 int pass)
2897{ 2833{
2898 int error; 2834 int error;
2899 2835
2900 if ((error = xlog_recover_unlink_tid(q, trans))) 2836 hlist_del(&trans->r_list);
2901 return error;
2902 if ((error = xlog_recover_do_trans(log, trans, pass))) 2837 if ((error = xlog_recover_do_trans(log, trans, pass)))
2903 return error; 2838 return error;
2904 xlog_recover_free_trans(trans); /* no error */ 2839 xlog_recover_free_trans(trans); /* no error */
@@ -2926,7 +2861,7 @@ xlog_recover_unmount_trans(
2926STATIC int 2861STATIC int
2927xlog_recover_process_data( 2862xlog_recover_process_data(
2928 xlog_t *log, 2863 xlog_t *log,
2929 xlog_recover_t *rhash[], 2864 struct hlist_head rhash[],
2930 xlog_rec_header_t *rhead, 2865 xlog_rec_header_t *rhead,
2931 xfs_caddr_t dp, 2866 xfs_caddr_t dp,
2932 int pass) 2867 int pass)
@@ -2960,7 +2895,7 @@ xlog_recover_process_data(
2960 } 2895 }
2961 tid = be32_to_cpu(ohead->oh_tid); 2896 tid = be32_to_cpu(ohead->oh_tid);
2962 hash = XLOG_RHASH(tid); 2897 hash = XLOG_RHASH(tid);
2963 trans = xlog_recover_find_tid(rhash[hash], tid); 2898 trans = xlog_recover_find_tid(&rhash[hash], tid);
2964 if (trans == NULL) { /* not found; add new tid */ 2899 if (trans == NULL) { /* not found; add new tid */
2965 if (ohead->oh_flags & XLOG_START_TRANS) 2900 if (ohead->oh_flags & XLOG_START_TRANS)
2966 xlog_recover_new_tid(&rhash[hash], tid, 2901 xlog_recover_new_tid(&rhash[hash], tid,
@@ -2978,7 +2913,7 @@ xlog_recover_process_data(
2978 switch (flags) { 2913 switch (flags) {
2979 case XLOG_COMMIT_TRANS: 2914 case XLOG_COMMIT_TRANS:
2980 error = xlog_recover_commit_trans(log, 2915 error = xlog_recover_commit_trans(log,
2981 &rhash[hash], trans, pass); 2916 trans, pass);
2982 break; 2917 break;
2983 case XLOG_UNMOUNT_TRANS: 2918 case XLOG_UNMOUNT_TRANS:
2984 error = xlog_recover_unmount_trans(trans); 2919 error = xlog_recover_unmount_trans(trans);
@@ -3211,7 +3146,7 @@ xlog_recover_process_one_iunlink(
3211 /* 3146 /*
3212 * Get the on disk inode to find the next inode in the bucket. 3147 * Get the on disk inode to find the next inode in the bucket.
3213 */ 3148 */
3214 error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XFS_BUF_LOCK); 3149 error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK);
3215 if (error) 3150 if (error)
3216 goto fail_iput; 3151 goto fail_iput;
3217 3152
@@ -3517,7 +3452,7 @@ xlog_do_recovery_pass(
3517 int error = 0, h_size; 3452 int error = 0, h_size;
3518 int bblks, split_bblks; 3453 int bblks, split_bblks;
3519 int hblks, split_hblks, wrapped_hblks; 3454 int hblks, split_hblks, wrapped_hblks;
3520 xlog_recover_t *rhash[XLOG_RHASH_SIZE]; 3455 struct hlist_head rhash[XLOG_RHASH_SIZE];
3521 3456
3522 ASSERT(head_blk != tail_blk); 3457 ASSERT(head_blk != tail_blk);
3523 3458
@@ -3978,8 +3913,7 @@ xlog_recover_finish(
3978 * case the unlink transactions would have problems 3913 * case the unlink transactions would have problems
3979 * pushing the EFIs out of the way. 3914 * pushing the EFIs out of the way.
3980 */ 3915 */
3981 xfs_log_force(log->l_mp, (xfs_lsn_t)0, 3916 xfs_log_force(log->l_mp, XFS_LOG_SYNC);
3982 (XFS_LOG_FORCE | XFS_LOG_SYNC));
3983 3917
3984 xlog_recover_process_iunlinks(log); 3918 xlog_recover_process_iunlinks(log);
3985 3919
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
index b22545555301..75d749207258 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/xfs_log_recover.h
@@ -35,22 +35,21 @@
35 * item headers are in ri_buf[0]. Additional buffers follow. 35 * item headers are in ri_buf[0]. Additional buffers follow.
36 */ 36 */
37typedef struct xlog_recover_item { 37typedef struct xlog_recover_item {
38 struct xlog_recover_item *ri_next; 38 struct list_head ri_list;
39 struct xlog_recover_item *ri_prev; 39 int ri_type;
40 int ri_type; 40 int ri_cnt; /* count of regions found */
41 int ri_cnt; /* count of regions found */ 41 int ri_total; /* total regions */
42 int ri_total; /* total regions */ 42 xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */
43 xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */
44} xlog_recover_item_t; 43} xlog_recover_item_t;
45 44
46struct xlog_tid; 45struct xlog_tid;
47typedef struct xlog_recover { 46typedef struct xlog_recover {
48 struct xlog_recover *r_next; 47 struct hlist_node r_list;
49 xlog_tid_t r_log_tid; /* log's transaction id */ 48 xlog_tid_t r_log_tid; /* log's transaction id */
50 xfs_trans_header_t r_theader; /* trans header for partial */ 49 xfs_trans_header_t r_theader; /* trans header for partial */
51 int r_state; /* not needed */ 50 int r_state; /* not needed */
52 xfs_lsn_t r_lsn; /* xact lsn */ 51 xfs_lsn_t r_lsn; /* xact lsn */
53 xlog_recover_item_t *r_itemq; /* q for items */ 52 struct list_head r_itemq; /* q for items */
54} xlog_recover_t; 53} xlog_recover_t;
55 54
56#define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr) 55#define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index eb403b40e120..e79b56b4bca6 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -201,6 +201,38 @@ xfs_uuid_unmount(
201 201
202 202
203/* 203/*
204 * Reference counting access wrappers to the perag structures.
205 */
206struct xfs_perag *
207xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
208{
209 struct xfs_perag *pag;
210 int ref = 0;
211
212 spin_lock(&mp->m_perag_lock);
213 pag = radix_tree_lookup(&mp->m_perag_tree, agno);
214 if (pag) {
215 ASSERT(atomic_read(&pag->pag_ref) >= 0);
216 /* catch leaks in the positive direction during testing */
217 ASSERT(atomic_read(&pag->pag_ref) < 1000);
218 ref = atomic_inc_return(&pag->pag_ref);
219 }
220 spin_unlock(&mp->m_perag_lock);
221 trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
222 return pag;
223}
224
225void
226xfs_perag_put(struct xfs_perag *pag)
227{
228 int ref;
229
230 ASSERT(atomic_read(&pag->pag_ref) > 0);
231 ref = atomic_dec_return(&pag->pag_ref);
232 trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
233}
234
235/*
204 * Free up the resources associated with a mount structure. Assume that 236 * Free up the resources associated with a mount structure. Assume that
205 * the structure was initially zeroed, so we can tell which fields got 237 * the structure was initially zeroed, so we can tell which fields got
206 * initialized. 238 * initialized.
@@ -209,13 +241,16 @@ STATIC void
209xfs_free_perag( 241xfs_free_perag(
210 xfs_mount_t *mp) 242 xfs_mount_t *mp)
211{ 243{
212 if (mp->m_perag) { 244 xfs_agnumber_t agno;
213 int agno; 245 struct xfs_perag *pag;
214 246
215 for (agno = 0; agno < mp->m_maxagi; agno++) 247 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
216 if (mp->m_perag[agno].pagb_list) 248 spin_lock(&mp->m_perag_lock);
217 kmem_free(mp->m_perag[agno].pagb_list); 249 pag = radix_tree_delete(&mp->m_perag_tree, agno);
218 kmem_free(mp->m_perag); 250 ASSERT(pag);
251 ASSERT(atomic_read(&pag->pag_ref) == 0);
252 spin_unlock(&mp->m_perag_lock);
253 kmem_free(pag);
219 } 254 }
220} 255}
221 256
@@ -389,22 +424,57 @@ xfs_initialize_perag_icache(
389 } 424 }
390} 425}
391 426
392xfs_agnumber_t 427int
393xfs_initialize_perag( 428xfs_initialize_perag(
394 xfs_mount_t *mp, 429 xfs_mount_t *mp,
395 xfs_agnumber_t agcount) 430 xfs_agnumber_t agcount,
431 xfs_agnumber_t *maxagi)
396{ 432{
397 xfs_agnumber_t index, max_metadata; 433 xfs_agnumber_t index, max_metadata;
434 xfs_agnumber_t first_initialised = 0;
398 xfs_perag_t *pag; 435 xfs_perag_t *pag;
399 xfs_agino_t agino; 436 xfs_agino_t agino;
400 xfs_ino_t ino; 437 xfs_ino_t ino;
401 xfs_sb_t *sbp = &mp->m_sb; 438 xfs_sb_t *sbp = &mp->m_sb;
402 xfs_ino_t max_inum = XFS_MAXINUMBER_32; 439 xfs_ino_t max_inum = XFS_MAXINUMBER_32;
440 int error = -ENOMEM;
403 441
404 /* Check to see if the filesystem can overflow 32 bit inodes */ 442 /* Check to see if the filesystem can overflow 32 bit inodes */
405 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); 443 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
406 ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino); 444 ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
407 445
446 /*
447 * Walk the current per-ag tree so we don't try to initialise AGs
448 * that already exist (growfs case). Allocate and insert all the
449 * AGs we don't find ready for initialisation.
450 */
451 for (index = 0; index < agcount; index++) {
452 pag = xfs_perag_get(mp, index);
453 if (pag) {
454 xfs_perag_put(pag);
455 continue;
456 }
457 if (!first_initialised)
458 first_initialised = index;
459 pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
460 if (!pag)
461 goto out_unwind;
462 if (radix_tree_preload(GFP_NOFS))
463 goto out_unwind;
464 spin_lock(&mp->m_perag_lock);
465 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
466 BUG();
467 spin_unlock(&mp->m_perag_lock);
468 radix_tree_preload_end();
469 error = -EEXIST;
470 goto out_unwind;
471 }
472 pag->pag_agno = index;
473 pag->pag_mount = mp;
474 spin_unlock(&mp->m_perag_lock);
475 radix_tree_preload_end();
476 }
477
408 /* Clear the mount flag if no inode can overflow 32 bits 478 /* Clear the mount flag if no inode can overflow 32 bits
409 * on this filesystem, or if specifically requested.. 479 * on this filesystem, or if specifically requested..
410 */ 480 */
@@ -438,21 +508,33 @@ xfs_initialize_perag(
438 } 508 }
439 509
440 /* This ag is preferred for inodes */ 510 /* This ag is preferred for inodes */
441 pag = &mp->m_perag[index]; 511 pag = xfs_perag_get(mp, index);
442 pag->pagi_inodeok = 1; 512 pag->pagi_inodeok = 1;
443 if (index < max_metadata) 513 if (index < max_metadata)
444 pag->pagf_metadata = 1; 514 pag->pagf_metadata = 1;
445 xfs_initialize_perag_icache(pag); 515 xfs_initialize_perag_icache(pag);
516 xfs_perag_put(pag);
446 } 517 }
447 } else { 518 } else {
448 /* Setup default behavior for smaller filesystems */ 519 /* Setup default behavior for smaller filesystems */
449 for (index = 0; index < agcount; index++) { 520 for (index = 0; index < agcount; index++) {
450 pag = &mp->m_perag[index]; 521 pag = xfs_perag_get(mp, index);
451 pag->pagi_inodeok = 1; 522 pag->pagi_inodeok = 1;
452 xfs_initialize_perag_icache(pag); 523 xfs_initialize_perag_icache(pag);
524 xfs_perag_put(pag);
453 } 525 }
454 } 526 }
455 return index; 527 if (maxagi)
528 *maxagi = index;
529 return 0;
530
531out_unwind:
532 kmem_free(pag);
533 for (; index > first_initialised; index--) {
534 pag = radix_tree_delete(&mp->m_perag_tree, index);
535 kmem_free(pag);
536 }
537 return error;
456} 538}
457 539
458void 540void
@@ -583,7 +665,7 @@ xfs_readsb(xfs_mount_t *mp, int flags)
583 * access to the superblock. 665 * access to the superblock.
584 */ 666 */
585 sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); 667 sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
586 extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED; 668 extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED;
587 669
588 bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), 670 bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size),
589 extra_flags); 671 extra_flags);
@@ -731,12 +813,13 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
731 error = xfs_ialloc_pagi_init(mp, NULL, index); 813 error = xfs_ialloc_pagi_init(mp, NULL, index);
732 if (error) 814 if (error)
733 return error; 815 return error;
734 pag = &mp->m_perag[index]; 816 pag = xfs_perag_get(mp, index);
735 ifree += pag->pagi_freecount; 817 ifree += pag->pagi_freecount;
736 ialloc += pag->pagi_count; 818 ialloc += pag->pagi_count;
737 bfree += pag->pagf_freeblks; 819 bfree += pag->pagf_freeblks;
738 bfreelst += pag->pagf_flcount; 820 bfreelst += pag->pagf_flcount;
739 btree += pag->pagf_btreeblks; 821 btree += pag->pagf_btreeblks;
822 xfs_perag_put(pag);
740 } 823 }
741 /* 824 /*
742 * Overwrite incore superblock counters with just-read data 825 * Overwrite incore superblock counters with just-read data
@@ -1008,6 +1091,24 @@ xfs_mount_reset_sbqflags(
1008 return xfs_trans_commit(tp, 0); 1091 return xfs_trans_commit(tp, 0);
1009} 1092}
1010 1093
1094__uint64_t
1095xfs_default_resblks(xfs_mount_t *mp)
1096{
1097 __uint64_t resblks;
1098
1099 /*
1100 * We default to 5% or 8192 fsbs of space reserved, whichever is
1101 * smaller. This is intended to cover concurrent allocation
1102 * transactions when we initially hit enospc. These each require a 4
1103 * block reservation. Hence by default we cover roughly 2000 concurrent
1104 * allocation reservations.
1105 */
1106 resblks = mp->m_sb.sb_dblocks;
1107 do_div(resblks, 20);
1108 resblks = min_t(__uint64_t, resblks, 8192);
1109 return resblks;
1110}
1111
1011/* 1112/*
1012 * This function does the following on an initial mount of a file system: 1113 * This function does the following on an initial mount of a file system:
1013 * - reads the superblock from disk and init the mount struct 1114 * - reads the superblock from disk and init the mount struct
@@ -1152,13 +1253,13 @@ xfs_mountfs(
1152 /* 1253 /*
1153 * Allocate and initialize the per-ag data. 1254 * Allocate and initialize the per-ag data.
1154 */ 1255 */
1155 init_rwsem(&mp->m_peraglock); 1256 spin_lock_init(&mp->m_perag_lock);
1156 mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), 1257 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_NOFS);
1157 KM_MAYFAIL); 1258 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
1158 if (!mp->m_perag) 1259 if (error) {
1260 cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error);
1159 goto out_remove_uuid; 1261 goto out_remove_uuid;
1160 1262 }
1161 mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount);
1162 1263
1163 if (!sbp->sb_logblocks) { 1264 if (!sbp->sb_logblocks) {
1164 cmn_err(CE_WARN, "XFS: no log defined"); 1265 cmn_err(CE_WARN, "XFS: no log defined");
@@ -1319,17 +1420,16 @@ xfs_mountfs(
1319 * attr, unwritten extent conversion at ENOSPC, etc. Data allocations 1420 * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
1320 * are not allowed to use this reserved space. 1421 * are not allowed to use this reserved space.
1321 * 1422 *
1322 * We default to 5% or 1024 fsbs of space reserved, whichever is smaller.
1323 * This may drive us straight to ENOSPC on mount, but that implies 1423 * This may drive us straight to ENOSPC on mount, but that implies
1324 * we were already there on the last unmount. Warn if this occurs. 1424 * we were already there on the last unmount. Warn if this occurs.
1325 */ 1425 */
1326 resblks = mp->m_sb.sb_dblocks; 1426 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
1327 do_div(resblks, 20); 1427 resblks = xfs_default_resblks(mp);
1328 resblks = min_t(__uint64_t, resblks, 1024); 1428 error = xfs_reserve_blocks(mp, &resblks, NULL);
1329 error = xfs_reserve_blocks(mp, &resblks, NULL); 1429 if (error)
1330 if (error) 1430 cmn_err(CE_WARN, "XFS: Unable to allocate reserve "
1331 cmn_err(CE_WARN, "XFS: Unable to allocate reserve blocks. " 1431 "blocks. Continuing without a reserve pool.");
1332 "Continuing without a reserve pool."); 1432 }
1333 1433
1334 return 0; 1434 return 0;
1335 1435
@@ -1372,8 +1472,19 @@ xfs_unmountfs(
1372 * push out the iclog we will never get that unlocked. hence we 1472 * push out the iclog we will never get that unlocked. hence we
1373 * need to force the log first. 1473 * need to force the log first.
1374 */ 1474 */
1375 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); 1475 xfs_log_force(mp, XFS_LOG_SYNC);
1376 xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC); 1476
1477 /*
1478 * Do a delwri reclaim pass first so that as many dirty inodes are
1479 * queued up for IO as possible. Then flush the buffers before making
1480 * a synchronous path to catch all the remaining inodes are reclaimed.
1481 * This makes the reclaim process as quick as possible by avoiding
1482 * synchronous writeout and blocking on inodes already in the delwri
1483 * state as much as possible.
1484 */
1485 xfs_reclaim_inodes(mp, 0);
1486 XFS_bflush(mp->m_ddev_targp);
1487 xfs_reclaim_inodes(mp, SYNC_WAIT);
1377 1488
1378 xfs_qm_unmount(mp); 1489 xfs_qm_unmount(mp);
1379 1490
@@ -1382,7 +1493,7 @@ xfs_unmountfs(
1382 * that nothing is pinned. This is important because bflush() 1493 * that nothing is pinned. This is important because bflush()
1383 * will skip pinned buffers. 1494 * will skip pinned buffers.
1384 */ 1495 */
1385 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); 1496 xfs_log_force(mp, XFS_LOG_SYNC);
1386 1497
1387 xfs_binval(mp->m_ddev_targp); 1498 xfs_binval(mp->m_ddev_targp);
1388 if (mp->m_rtdev_targp) { 1499 if (mp->m_rtdev_targp) {
@@ -1548,15 +1659,14 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
1548 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields); 1659 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
1549 1660
1550 /* find modified range */ 1661 /* find modified range */
1662 f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
1663 ASSERT((1LL << f) & XFS_SB_MOD_BITS);
1664 last = xfs_sb_info[f + 1].offset - 1;
1551 1665
1552 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); 1666 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
1553 ASSERT((1LL << f) & XFS_SB_MOD_BITS); 1667 ASSERT((1LL << f) & XFS_SB_MOD_BITS);
1554 first = xfs_sb_info[f].offset; 1668 first = xfs_sb_info[f].offset;
1555 1669
1556 f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
1557 ASSERT((1LL << f) & XFS_SB_MOD_BITS);
1558 last = xfs_sb_info[f + 1].offset - 1;
1559
1560 xfs_trans_log_buf(tp, bp, first, last); 1670 xfs_trans_log_buf(tp, bp, first, last);
1561} 1671}
1562 1672
@@ -1620,26 +1730,30 @@ xfs_mod_incore_sb_unlocked(
1620 lcounter += rem; 1730 lcounter += rem;
1621 } 1731 }
1622 } else { /* Taking blocks away */ 1732 } else { /* Taking blocks away */
1623
1624 lcounter += delta; 1733 lcounter += delta;
1734 if (lcounter >= 0) {
1735 mp->m_sb.sb_fdblocks = lcounter +
1736 XFS_ALLOC_SET_ASIDE(mp);
1737 return 0;
1738 }
1625 1739
1626 /* 1740 /*
1627 * If were out of blocks, use any available reserved blocks if 1741 * We are out of blocks, use any available reserved
1628 * were allowed to. 1742 * blocks if were allowed to.
1629 */ 1743 */
1744 if (!rsvd)
1745 return XFS_ERROR(ENOSPC);
1630 1746
1631 if (lcounter < 0) { 1747 lcounter = (long long)mp->m_resblks_avail + delta;
1632 if (rsvd) { 1748 if (lcounter >= 0) {
1633 lcounter = (long long)mp->m_resblks_avail + delta; 1749 mp->m_resblks_avail = lcounter;
1634 if (lcounter < 0) { 1750 return 0;
1635 return XFS_ERROR(ENOSPC);
1636 }
1637 mp->m_resblks_avail = lcounter;
1638 return 0;
1639 } else { /* not reserved */
1640 return XFS_ERROR(ENOSPC);
1641 }
1642 } 1751 }
1752 printk_once(KERN_WARNING
1753 "Filesystem \"%s\": reserve blocks depleted! "
1754 "Consider increasing reserve pool size.",
1755 mp->m_fsname);
1756 return XFS_ERROR(ENOSPC);
1643 } 1757 }
1644 1758
1645 mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); 1759 mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
@@ -1887,7 +2001,7 @@ xfs_getsb(
1887 2001
1888 ASSERT(mp->m_sb_bp != NULL); 2002 ASSERT(mp->m_sb_bp != NULL);
1889 bp = mp->m_sb_bp; 2003 bp = mp->m_sb_bp;
1890 if (flags & XFS_BUF_TRYLOCK) { 2004 if (flags & XBF_TRYLOCK) {
1891 if (!XFS_BUF_CPSEMA(bp)) { 2005 if (!XFS_BUF_CPSEMA(bp)) {
1892 return NULL; 2006 return NULL;
1893 } 2007 }
@@ -1947,6 +2061,26 @@ xfs_mount_log_sb(
1947 return error; 2061 return error;
1948} 2062}
1949 2063
2064/*
2065 * If the underlying (data/log/rt) device is readonly, there are some
2066 * operations that cannot proceed.
2067 */
2068int
2069xfs_dev_is_read_only(
2070 struct xfs_mount *mp,
2071 char *message)
2072{
2073 if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
2074 xfs_readonly_buftarg(mp->m_logdev_targp) ||
2075 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
2076 cmn_err(CE_NOTE,
2077 "XFS: %s required on read-only device.", message);
2078 cmn_err(CE_NOTE,
2079 "XFS: write access unavailable, cannot proceed.");
2080 return EROFS;
2081 }
2082 return 0;
2083}
1950 2084
1951#ifdef HAVE_PERCPU_SB 2085#ifdef HAVE_PERCPU_SB
1952/* 2086/*
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 1df7e4502967..4fa0bc7b983e 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -78,7 +78,8 @@ typedef int (*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t);
78typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *, 78typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
79 struct xfs_inode *, dm_right_t, 79 struct xfs_inode *, dm_right_t,
80 struct xfs_inode *, dm_right_t, 80 struct xfs_inode *, dm_right_t,
81 const char *, const char *, mode_t, int, int); 81 const unsigned char *, const unsigned char *,
82 mode_t, int, int);
82typedef int (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t, 83typedef int (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t,
83 char *, char *); 84 char *, char *);
84typedef void (*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *, 85typedef void (*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *,
@@ -207,8 +208,8 @@ typedef struct xfs_mount {
207 uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ 208 uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
208 uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ 209 uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
209 uint m_in_maxlevels; /* max inobt btree levels. */ 210 uint m_in_maxlevels; /* max inobt btree levels. */
210 struct xfs_perag *m_perag; /* per-ag accounting info */ 211 struct radix_tree_root m_perag_tree; /* per-ag accounting info */
211 struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ 212 spinlock_t m_perag_lock; /* lock for m_perag_tree */
212 struct mutex m_growlock; /* growfs mutex */ 213 struct mutex m_growlock; /* growfs mutex */
213 int m_fixedfsid[2]; /* unchanged for life of FS */ 214 int m_fixedfsid[2]; /* unchanged for life of FS */
214 uint m_dmevmask; /* DMI events for this FS */ 215 uint m_dmevmask; /* DMI events for this FS */
@@ -224,6 +225,7 @@ typedef struct xfs_mount {
224 __uint64_t m_maxioffset; /* maximum inode offset */ 225 __uint64_t m_maxioffset; /* maximum inode offset */
225 __uint64_t m_resblks; /* total reserved blocks */ 226 __uint64_t m_resblks; /* total reserved blocks */
226 __uint64_t m_resblks_avail;/* available reserved blocks */ 227 __uint64_t m_resblks_avail;/* available reserved blocks */
228 __uint64_t m_resblks_save; /* reserved blks @ remount,ro */
227 int m_dalign; /* stripe unit */ 229 int m_dalign; /* stripe unit */
228 int m_swidth; /* stripe width */ 230 int m_swidth; /* stripe width */
229 int m_sinoalign; /* stripe unit inode alignment */ 231 int m_sinoalign; /* stripe unit inode alignment */
@@ -243,7 +245,7 @@ typedef struct xfs_mount {
243 struct xfs_qmops *m_qm_ops; /* vector of XQM ops */ 245 struct xfs_qmops *m_qm_ops; /* vector of XQM ops */
244 atomic_t m_active_trans; /* number trans frozen */ 246 atomic_t m_active_trans; /* number trans frozen */
245#ifdef HAVE_PERCPU_SB 247#ifdef HAVE_PERCPU_SB
246 xfs_icsb_cnts_t *m_sb_cnts; /* per-cpu superblock counters */ 248 xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */
247 unsigned long m_icsb_counters; /* disabled per-cpu counters */ 249 unsigned long m_icsb_counters; /* disabled per-cpu counters */
248 struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */ 250 struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */
249 struct mutex m_icsb_mutex; /* balancer sync lock */ 251 struct mutex m_icsb_mutex; /* balancer sync lock */
@@ -384,19 +386,10 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
384} 386}
385 387
386/* 388/*
387 * perag get/put wrappers for eventual ref counting 389 * perag get/put wrappers for ref counting
388 */ 390 */
389static inline xfs_perag_t * 391struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
390xfs_get_perag(struct xfs_mount *mp, xfs_ino_t ino) 392void xfs_perag_put(struct xfs_perag *pag);
391{
392 return &mp->m_perag[XFS_INO_TO_AGNO(mp, ino)];
393}
394
395static inline void
396xfs_put_perag(struct xfs_mount *mp, xfs_perag_t *pag)
397{
398 /* nothing to see here, move along */
399}
400 393
401/* 394/*
402 * Per-cpu superblock locking functions 395 * Per-cpu superblock locking functions
@@ -428,6 +421,7 @@ typedef struct xfs_mod_sb {
428} xfs_mod_sb_t; 421} xfs_mod_sb_t;
429 422
430extern int xfs_log_sbcount(xfs_mount_t *, uint); 423extern int xfs_log_sbcount(xfs_mount_t *, uint);
424extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
431extern int xfs_mountfs(xfs_mount_t *mp); 425extern int xfs_mountfs(xfs_mount_t *mp);
432 426
433extern void xfs_unmountfs(xfs_mount_t *); 427extern void xfs_unmountfs(xfs_mount_t *);
@@ -442,6 +436,8 @@ extern void xfs_freesb(xfs_mount_t *);
442extern int xfs_fs_writable(xfs_mount_t *); 436extern int xfs_fs_writable(xfs_mount_t *);
443extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); 437extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
444 438
439extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
440
445extern int xfs_dmops_get(struct xfs_mount *); 441extern int xfs_dmops_get(struct xfs_mount *);
446extern void xfs_dmops_put(struct xfs_mount *); 442extern void xfs_dmops_put(struct xfs_mount *);
447 443
@@ -450,7 +446,8 @@ extern struct xfs_dmops xfs_dmcore_xfs;
450#endif /* __KERNEL__ */ 446#endif /* __KERNEL__ */
451 447
452extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 448extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
453extern xfs_agnumber_t xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t); 449extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
450 xfs_agnumber_t *);
454extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); 451extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
455extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); 452extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
456 453
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 4b0613d99faa..45ce15dc5b2b 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -398,7 +398,7 @@ exit:
398 * guaranteed that all the free functions for all the elements have finished 398 * guaranteed that all the free functions for all the elements have finished
399 * executing and the reaper is not running. 399 * executing and the reaper is not running.
400 */ 400 */
401void 401static void
402xfs_mru_cache_flush( 402xfs_mru_cache_flush(
403 xfs_mru_cache_t *mru) 403 xfs_mru_cache_t *mru)
404{ 404{
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
index 5d439f34b0c9..36dd3ec8b4eb 100644
--- a/fs/xfs/xfs_mru_cache.h
+++ b/fs/xfs/xfs_mru_cache.h
@@ -42,7 +42,6 @@ void xfs_mru_cache_uninit(void);
42int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms, 42int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms,
43 unsigned int grp_count, 43 unsigned int grp_count,
44 xfs_mru_cache_free_func_t free_func); 44 xfs_mru_cache_free_func_t free_func);
45void xfs_mru_cache_flush(xfs_mru_cache_t *mru);
46void xfs_mru_cache_destroy(struct xfs_mru_cache *mru); 45void xfs_mru_cache_destroy(struct xfs_mru_cache *mru);
47int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, 46int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
48 void *value); 47 void *value);
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 91bfd60f4c74..fdcab3f81dde 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -223,16 +223,9 @@ typedef struct xfs_qoff_logformat {
223#define XFS_QMOPT_RES_INOS 0x0800000 223#define XFS_QMOPT_RES_INOS 0x0800000
224 224
225/* 225/*
226 * flags for dqflush and dqflush_all.
227 */
228#define XFS_QMOPT_SYNC 0x1000000
229#define XFS_QMOPT_ASYNC 0x2000000
230#define XFS_QMOPT_DELWRI 0x4000000
231
232/*
233 * flags for dqalloc. 226 * flags for dqalloc.
234 */ 227 */
235#define XFS_QMOPT_INHERIT 0x8000000 228#define XFS_QMOPT_INHERIT 0x1000000
236 229
237/* 230/*
238 * flags to xfs_trans_mod_dquot. 231 * flags to xfs_trans_mod_dquot.
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 9e15a1185362..6be05f756d59 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1517,6 +1517,8 @@ xfs_rtfree_range(
1517 */ 1517 */
1518 error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, 1518 error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
1519 &postblock); 1519 &postblock);
1520 if (error)
1521 return error;
1520 /* 1522 /*
1521 * If there are blocks not being freed at the front of the 1523 * If there are blocks not being freed at the front of the
1522 * old extent, add summary data for them to be allocated. 1524 * old extent, add summary data for them to be allocated.
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 5aa07caea5f1..e336742a58a4 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -47,48 +47,6 @@
47#include "xfs_trace.h" 47#include "xfs_trace.h"
48 48
49/* 49/*
50 * This is a subroutine for xfs_write() and other writers (xfs_ioctl)
51 * which clears the setuid and setgid bits when a file is written.
52 */
53int
54xfs_write_clear_setuid(
55 xfs_inode_t *ip)
56{
57 xfs_mount_t *mp;
58 xfs_trans_t *tp;
59 int error;
60
61 mp = ip->i_mount;
62 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
63 if ((error = xfs_trans_reserve(tp, 0,
64 XFS_WRITEID_LOG_RES(mp),
65 0, 0, 0))) {
66 xfs_trans_cancel(tp, 0);
67 return error;
68 }
69 xfs_ilock(ip, XFS_ILOCK_EXCL);
70 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
71 xfs_trans_ihold(tp, ip);
72 ip->i_d.di_mode &= ~S_ISUID;
73
74 /*
75 * Note that we don't have to worry about mandatory
76 * file locking being disabled here because we only
77 * clear the S_ISGID bit if the Group execute bit is
78 * on, but if it was on then mandatory locking wouldn't
79 * have been enabled.
80 */
81 if (ip->i_d.di_mode & S_IXGRP) {
82 ip->i_d.di_mode &= ~S_ISGID;
83 }
84 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
85 xfs_trans_set_sync(tp);
86 error = xfs_trans_commit(tp, 0);
87 xfs_iunlock(ip, XFS_ILOCK_EXCL);
88 return 0;
89}
90
91/*
92 * Force a shutdown of the filesystem instantly while keeping 50 * Force a shutdown of the filesystem instantly while keeping
93 * the filesystem consistent. We don't do an unmount here; just shutdown 51 * the filesystem consistent. We don't do an unmount here; just shutdown
94 * the shop, make sure that absolutely nothing persistent happens to 52 * the shop, make sure that absolutely nothing persistent happens to
@@ -153,88 +111,6 @@ xfs_do_force_shutdown(
153 } 111 }
154} 112}
155 113
156
157/*
158 * Called when we want to stop a buffer from getting written or read.
159 * We attach the EIO error, muck with its flags, and call biodone
160 * so that the proper iodone callbacks get called.
161 */
162int
163xfs_bioerror(
164 xfs_buf_t *bp)
165{
166
167#ifdef XFSERRORDEBUG
168 ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
169#endif
170
171 /*
172 * No need to wait until the buffer is unpinned.
173 * We aren't flushing it.
174 */
175 XFS_BUF_ERROR(bp, EIO);
176 /*
177 * We're calling biodone, so delete B_DONE flag. Either way
178 * we have to call the iodone callback, and calling biodone
179 * probably is the best way since it takes care of
180 * GRIO as well.
181 */
182 XFS_BUF_UNREAD(bp);
183 XFS_BUF_UNDELAYWRITE(bp);
184 XFS_BUF_UNDONE(bp);
185 XFS_BUF_STALE(bp);
186
187 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
188 xfs_biodone(bp);
189
190 return (EIO);
191}
192
193/*
194 * Same as xfs_bioerror, except that we are releasing the buffer
195 * here ourselves, and avoiding the biodone call.
196 * This is meant for userdata errors; metadata bufs come with
197 * iodone functions attached, so that we can track down errors.
198 */
199int
200xfs_bioerror_relse(
201 xfs_buf_t *bp)
202{
203 int64_t fl;
204
205 ASSERT(XFS_BUF_IODONE_FUNC(bp) != xfs_buf_iodone_callbacks);
206 ASSERT(XFS_BUF_IODONE_FUNC(bp) != xlog_iodone);
207
208 fl = XFS_BUF_BFLAGS(bp);
209 /*
210 * No need to wait until the buffer is unpinned.
211 * We aren't flushing it.
212 *
213 * chunkhold expects B_DONE to be set, whether
214 * we actually finish the I/O or not. We don't want to
215 * change that interface.
216 */
217 XFS_BUF_UNREAD(bp);
218 XFS_BUF_UNDELAYWRITE(bp);
219 XFS_BUF_DONE(bp);
220 XFS_BUF_STALE(bp);
221 XFS_BUF_CLR_IODONE_FUNC(bp);
222 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
223 if (!(fl & XFS_B_ASYNC)) {
224 /*
225 * Mark b_error and B_ERROR _both_.
226 * Lot's of chunkcache code assumes that.
227 * There's no reason to mark error for
228 * ASYNC buffers.
229 */
230 XFS_BUF_ERROR(bp, EIO);
231 XFS_BUF_FINISH_IOWAIT(bp);
232 } else {
233 xfs_buf_relse(bp);
234 }
235 return (EIO);
236}
237
238/* 114/*
239 * Prints out an ALERT message about I/O error. 115 * Prints out an ALERT message about I/O error.
240 */ 116 */
@@ -306,37 +182,6 @@ xfs_read_buf(
306} 182}
307 183
308/* 184/*
309 * Wrapper around bwrite() so that we can trap
310 * write errors, and act accordingly.
311 */
312int
313xfs_bwrite(
314 struct xfs_mount *mp,
315 struct xfs_buf *bp)
316{
317 int error;
318
319 /*
320 * XXXsup how does this work for quotas.
321 */
322 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
323 bp->b_mount = mp;
324 XFS_BUF_WRITE(bp);
325
326 if ((error = XFS_bwrite(bp))) {
327 ASSERT(mp);
328 /*
329 * Cannot put a buftrace here since if the buffer is not
330 * B_HOLD then we will brelse() the buffer before returning
331 * from bwrite and we could be tracing a buffer that has
332 * been reused.
333 */
334 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
335 }
336 return (error);
337}
338
339/*
340 * helper function to extract extent size hint from inode 185 * helper function to extract extent size hint from inode
341 */ 186 */
342xfs_extlen_t 187xfs_extlen_t
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index 571f2174435c..11c41ec6ed75 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -39,10 +39,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
39/* 39/*
40 * Prototypes for functions in xfs_rw.c. 40 * Prototypes for functions in xfs_rw.c.
41 */ 41 */
42extern int xfs_write_clear_setuid(struct xfs_inode *ip);
43extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
44extern int xfs_bioerror(struct xfs_buf *bp);
45extern int xfs_bioerror_relse(struct xfs_buf *bp);
46extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, 42extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
47 xfs_daddr_t blkno, int len, uint flags, 43 xfs_daddr_t blkno, int len, uint flags,
48 struct xfs_buf **bpp); 44 struct xfs_buf **bpp);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 237badcbac3b..f73e358bae8d 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -796,7 +796,7 @@ _xfs_trans_commit(
796 int sync; 796 int sync;
797#define XFS_TRANS_LOGVEC_COUNT 16 797#define XFS_TRANS_LOGVEC_COUNT 16
798 xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT]; 798 xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
799 void *commit_iclog; 799 struct xlog_in_core *commit_iclog;
800 int shutdown; 800 int shutdown;
801 801
802 commit_lsn = -1; 802 commit_lsn = -1;
@@ -981,9 +981,8 @@ shut_us_down:
981 */ 981 */
982 if (sync) { 982 if (sync) {
983 if (!error) { 983 if (!error) {
984 error = _xfs_log_force(mp, commit_lsn, 984 error = _xfs_log_force_lsn(mp, commit_lsn,
985 XFS_LOG_FORCE | XFS_LOG_SYNC, 985 XFS_LOG_SYNC, log_flushed);
986 log_flushed);
987 } 986 }
988 XFS_STATS_INC(xs_trans_sync); 987 XFS_STATS_INC(xs_trans_sync);
989 } else { 988 } else {
@@ -1121,7 +1120,7 @@ xfs_trans_fill_vecs(
1121 tp->t_header.th_num_items = nitems; 1120 tp->t_header.th_num_items = nitems;
1122 log_vector->i_addr = (xfs_caddr_t)&tp->t_header; 1121 log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
1123 log_vector->i_len = sizeof(xfs_trans_header_t); 1122 log_vector->i_len = sizeof(xfs_trans_header_t);
1124 XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_TRANSHDR); 1123 log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
1125} 1124}
1126 1125
1127 1126
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index ca64f33c63a3..79c8bab9dfff 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -861,8 +861,7 @@ typedef struct xfs_item_ops {
861#define XFS_ITEM_SUCCESS 0 861#define XFS_ITEM_SUCCESS 0
862#define XFS_ITEM_PINNED 1 862#define XFS_ITEM_PINNED 1
863#define XFS_ITEM_LOCKED 2 863#define XFS_ITEM_LOCKED 2
864#define XFS_ITEM_FLUSHING 3 864#define XFS_ITEM_PUSHBUF 3
865#define XFS_ITEM_PUSHBUF 4
866 865
867/* 866/*
868 * This structure is used to maintain a list of block ranges that have been 867 * This structure is used to maintain a list of block ranges that have been
@@ -911,7 +910,7 @@ typedef struct xfs_trans {
911 unsigned int t_blk_res_used; /* # of resvd blocks used */ 910 unsigned int t_blk_res_used; /* # of resvd blocks used */
912 unsigned int t_rtx_res; /* # of rt extents resvd */ 911 unsigned int t_rtx_res; /* # of rt extents resvd */
913 unsigned int t_rtx_res_used; /* # of resvd rt extents used */ 912 unsigned int t_rtx_res_used; /* # of resvd rt extents used */
914 xfs_log_ticket_t t_ticket; /* log mgr ticket */ 913 struct xlog_ticket *t_ticket; /* log mgr ticket */
915 xfs_lsn_t t_lsn; /* log seq num of start of 914 xfs_lsn_t t_lsn; /* log seq num of start of
916 * transaction. */ 915 * transaction. */
917 xfs_lsn_t t_commit_lsn; /* log seq num of end of 916 xfs_lsn_t t_commit_lsn; /* log seq num of end of
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 2ffc570679be..e799824f7245 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -237,14 +237,15 @@ out:
237} 237}
238 238
239/* 239/*
240 * Function that does the work of pushing on the AIL 240 * xfsaild_push does the work of pushing on the AIL. Returning a timeout of
241 * zero indicates that the caller should sleep until woken.
241 */ 242 */
242long 243long
243xfsaild_push( 244xfsaild_push(
244 struct xfs_ail *ailp, 245 struct xfs_ail *ailp,
245 xfs_lsn_t *last_lsn) 246 xfs_lsn_t *last_lsn)
246{ 247{
247 long tout = 1000; /* milliseconds */ 248 long tout = 0;
248 xfs_lsn_t last_pushed_lsn = *last_lsn; 249 xfs_lsn_t last_pushed_lsn = *last_lsn;
249 xfs_lsn_t target = ailp->xa_target; 250 xfs_lsn_t target = ailp->xa_target;
250 xfs_lsn_t lsn; 251 xfs_lsn_t lsn;
@@ -252,6 +253,7 @@ xfsaild_push(
252 int flush_log, count, stuck; 253 int flush_log, count, stuck;
253 xfs_mount_t *mp = ailp->xa_mount; 254 xfs_mount_t *mp = ailp->xa_mount;
254 struct xfs_ail_cursor *cur = &ailp->xa_cursors; 255 struct xfs_ail_cursor *cur = &ailp->xa_cursors;
256 int push_xfsbufd = 0;
255 257
256 spin_lock(&ailp->xa_lock); 258 spin_lock(&ailp->xa_lock);
257 xfs_trans_ail_cursor_init(ailp, cur); 259 xfs_trans_ail_cursor_init(ailp, cur);
@@ -262,7 +264,7 @@ xfsaild_push(
262 */ 264 */
263 xfs_trans_ail_cursor_done(ailp, cur); 265 xfs_trans_ail_cursor_done(ailp, cur);
264 spin_unlock(&ailp->xa_lock); 266 spin_unlock(&ailp->xa_lock);
265 last_pushed_lsn = 0; 267 *last_lsn = 0;
266 return tout; 268 return tout;
267 } 269 }
268 270
@@ -279,7 +281,6 @@ xfsaild_push(
279 * prevents use from spinning when we can't do anything or there is 281 * prevents use from spinning when we can't do anything or there is
280 * lots of contention on the AIL lists. 282 * lots of contention on the AIL lists.
281 */ 283 */
282 tout = 10;
283 lsn = lip->li_lsn; 284 lsn = lip->li_lsn;
284 flush_log = stuck = count = 0; 285 flush_log = stuck = count = 0;
285 while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) { 286 while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) {
@@ -308,6 +309,7 @@ xfsaild_push(
308 XFS_STATS_INC(xs_push_ail_pushbuf); 309 XFS_STATS_INC(xs_push_ail_pushbuf);
309 IOP_PUSHBUF(lip); 310 IOP_PUSHBUF(lip);
310 last_pushed_lsn = lsn; 311 last_pushed_lsn = lsn;
312 push_xfsbufd = 1;
311 break; 313 break;
312 314
313 case XFS_ITEM_PINNED: 315 case XFS_ITEM_PINNED:
@@ -322,12 +324,6 @@ xfsaild_push(
322 stuck++; 324 stuck++;
323 break; 325 break;
324 326
325 case XFS_ITEM_FLUSHING:
326 XFS_STATS_INC(xs_push_ail_flushing);
327 last_pushed_lsn = lsn;
328 stuck++;
329 break;
330
331 default: 327 default:
332 ASSERT(0); 328 ASSERT(0);
333 break; 329 break;
@@ -371,19 +367,24 @@ xfsaild_push(
371 * move forward in the AIL. 367 * move forward in the AIL.
372 */ 368 */
373 XFS_STATS_INC(xs_push_ail_flush); 369 XFS_STATS_INC(xs_push_ail_flush);
374 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 370 xfs_log_force(mp, 0);
371 }
372
373 if (push_xfsbufd) {
374 /* we've got delayed write buffers to flush */
375 wake_up_process(mp->m_ddev_targp->bt_task);
375 } 376 }
376 377
377 if (!count) { 378 if (!count) {
378 /* We're past our target or empty, so idle */ 379 /* We're past our target or empty, so idle */
379 tout = 1000; 380 last_pushed_lsn = 0;
380 } else if (XFS_LSN_CMP(lsn, target) >= 0) { 381 } else if (XFS_LSN_CMP(lsn, target) >= 0) {
381 /* 382 /*
382 * We reached the target so wait a bit longer for I/O to 383 * We reached the target so wait a bit longer for I/O to
383 * complete and remove pushed items from the AIL before we 384 * complete and remove pushed items from the AIL before we
384 * start the next scan from the start of the AIL. 385 * start the next scan from the start of the AIL.
385 */ 386 */
386 tout += 20; 387 tout = 50;
387 last_pushed_lsn = 0; 388 last_pushed_lsn = 0;
388 } else if ((stuck * 100) / count > 90) { 389 } else if ((stuck * 100) / count > 90) {
389 /* 390 /*
@@ -395,11 +396,14 @@ xfsaild_push(
395 * Backoff a bit more to allow some I/O to complete before 396 * Backoff a bit more to allow some I/O to complete before
396 * continuing from where we were. 397 * continuing from where we were.
397 */ 398 */
398 tout += 10; 399 tout = 20;
400 } else {
401 /* more to do, but wait a short while before continuing */
402 tout = 10;
399 } 403 }
400 *last_lsn = last_pushed_lsn; 404 *last_lsn = last_pushed_lsn;
401 return tout; 405 return tout;
402} /* xfsaild_push */ 406}
403 407
404 408
405/* 409/*
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 49130628d5ef..fb586360d1c9 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -46,6 +46,65 @@ STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *,
46STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *, 46STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *,
47 xfs_daddr_t, int); 47 xfs_daddr_t, int);
48 48
49/*
50 * Add the locked buffer to the transaction.
51 *
52 * The buffer must be locked, and it cannot be associated with any
53 * transaction.
54 *
55 * If the buffer does not yet have a buf log item associated with it,
56 * then allocate one for it. Then add the buf item to the transaction.
57 */
58STATIC void
59_xfs_trans_bjoin(
60 struct xfs_trans *tp,
61 struct xfs_buf *bp,
62 int reset_recur)
63{
64 struct xfs_buf_log_item *bip;
65
66 ASSERT(XFS_BUF_ISBUSY(bp));
67 ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
68
69 /*
70 * The xfs_buf_log_item pointer is stored in b_fsprivate. If
71 * it doesn't have one yet, then allocate one and initialize it.
72 * The checks to see if one is there are in xfs_buf_item_init().
73 */
74 xfs_buf_item_init(bp, tp->t_mountp);
75 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
76 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
77 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
78 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
79 if (reset_recur)
80 bip->bli_recur = 0;
81
82 /*
83 * Take a reference for this transaction on the buf item.
84 */
85 atomic_inc(&bip->bli_refcount);
86
87 /*
88 * Get a log_item_desc to point at the new item.
89 */
90 (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip);
91
92 /*
93 * Initialize b_fsprivate2 so we can find it with incore_match()
94 * in xfs_trans_get_buf() and friends above.
95 */
96 XFS_BUF_SET_FSPRIVATE2(bp, tp);
97
98}
99
100void
101xfs_trans_bjoin(
102 struct xfs_trans *tp,
103 struct xfs_buf *bp)
104{
105 _xfs_trans_bjoin(tp, bp, 0);
106 trace_xfs_trans_bjoin(bp->b_fspriv);
107}
49 108
50/* 109/*
51 * Get and lock the buffer for the caller if it is not already 110 * Get and lock the buffer for the caller if it is not already
@@ -75,13 +134,14 @@ xfs_trans_get_buf(xfs_trans_t *tp,
75 xfs_buf_log_item_t *bip; 134 xfs_buf_log_item_t *bip;
76 135
77 if (flags == 0) 136 if (flags == 0)
78 flags = XFS_BUF_LOCK | XFS_BUF_MAPPED; 137 flags = XBF_LOCK | XBF_MAPPED;
79 138
80 /* 139 /*
81 * Default to a normal get_buf() call if the tp is NULL. 140 * Default to a normal get_buf() call if the tp is NULL.
82 */ 141 */
83 if (tp == NULL) 142 if (tp == NULL)
84 return xfs_buf_get(target_dev, blkno, len, flags | BUF_BUSY); 143 return xfs_buf_get(target_dev, blkno, len,
144 flags | XBF_DONT_BLOCK);
85 145
86 /* 146 /*
87 * If we find the buffer in the cache with this transaction 147 * If we find the buffer in the cache with this transaction
@@ -117,54 +177,22 @@ xfs_trans_get_buf(xfs_trans_t *tp,
117 } 177 }
118 178
119 /* 179 /*
120 * We always specify the BUF_BUSY flag within a transaction so 180 * We always specify the XBF_DONT_BLOCK flag within a transaction
121 * that get_buf does not try to push out a delayed write buffer 181 * so that get_buf does not try to push out a delayed write buffer
122 * which might cause another transaction to take place (if the 182 * which might cause another transaction to take place (if the
123 * buffer was delayed alloc). Such recursive transactions can 183 * buffer was delayed alloc). Such recursive transactions can
124 * easily deadlock with our current transaction as well as cause 184 * easily deadlock with our current transaction as well as cause
125 * us to run out of stack space. 185 * us to run out of stack space.
126 */ 186 */
127 bp = xfs_buf_get(target_dev, blkno, len, flags | BUF_BUSY); 187 bp = xfs_buf_get(target_dev, blkno, len, flags | XBF_DONT_BLOCK);
128 if (bp == NULL) { 188 if (bp == NULL) {
129 return NULL; 189 return NULL;
130 } 190 }
131 191
132 ASSERT(!XFS_BUF_GETERROR(bp)); 192 ASSERT(!XFS_BUF_GETERROR(bp));
133 193
134 /* 194 _xfs_trans_bjoin(tp, bp, 1);
135 * The xfs_buf_log_item pointer is stored in b_fsprivate. If 195 trace_xfs_trans_get_buf(bp->b_fspriv);
136 * it doesn't have one yet, then allocate one and initialize it.
137 * The checks to see if one is there are in xfs_buf_item_init().
138 */
139 xfs_buf_item_init(bp, tp->t_mountp);
140
141 /*
142 * Set the recursion count for the buffer within this transaction
143 * to 0.
144 */
145 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
146 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
147 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
148 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
149 bip->bli_recur = 0;
150
151 /*
152 * Take a reference for this transaction on the buf item.
153 */
154 atomic_inc(&bip->bli_refcount);
155
156 /*
157 * Get a log_item_desc to point at the new item.
158 */
159 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
160
161 /*
162 * Initialize b_fsprivate2 so we can find it with incore_match()
163 * above.
164 */
165 XFS_BUF_SET_FSPRIVATE2(bp, tp);
166
167 trace_xfs_trans_get_buf(bip);
168 return (bp); 196 return (bp);
169} 197}
170 198
@@ -209,44 +237,11 @@ xfs_trans_getsb(xfs_trans_t *tp,
209 } 237 }
210 238
211 bp = xfs_getsb(mp, flags); 239 bp = xfs_getsb(mp, flags);
212 if (bp == NULL) { 240 if (bp == NULL)
213 return NULL; 241 return NULL;
214 }
215
216 /*
217 * The xfs_buf_log_item pointer is stored in b_fsprivate. If
218 * it doesn't have one yet, then allocate one and initialize it.
219 * The checks to see if one is there are in xfs_buf_item_init().
220 */
221 xfs_buf_item_init(bp, mp);
222
223 /*
224 * Set the recursion count for the buffer within this transaction
225 * to 0.
226 */
227 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
228 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
229 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
230 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
231 bip->bli_recur = 0;
232
233 /*
234 * Take a reference for this transaction on the buf item.
235 */
236 atomic_inc(&bip->bli_refcount);
237
238 /*
239 * Get a log_item_desc to point at the new item.
240 */
241 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
242
243 /*
244 * Initialize b_fsprivate2 so we can find it with incore_match()
245 * above.
246 */
247 XFS_BUF_SET_FSPRIVATE2(bp, tp);
248 242
249 trace_xfs_trans_getsb(bip); 243 _xfs_trans_bjoin(tp, bp, 1);
244 trace_xfs_trans_getsb(bp->b_fspriv);
250 return (bp); 245 return (bp);
251} 246}
252 247
@@ -290,15 +285,15 @@ xfs_trans_read_buf(
290 int error; 285 int error;
291 286
292 if (flags == 0) 287 if (flags == 0)
293 flags = XFS_BUF_LOCK | XFS_BUF_MAPPED; 288 flags = XBF_LOCK | XBF_MAPPED;
294 289
295 /* 290 /*
296 * Default to a normal get_buf() call if the tp is NULL. 291 * Default to a normal get_buf() call if the tp is NULL.
297 */ 292 */
298 if (tp == NULL) { 293 if (tp == NULL) {
299 bp = xfs_buf_read(target, blkno, len, flags | BUF_BUSY); 294 bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
300 if (!bp) 295 if (!bp)
301 return (flags & XFS_BUF_TRYLOCK) ? 296 return (flags & XBF_TRYLOCK) ?
302 EAGAIN : XFS_ERROR(ENOMEM); 297 EAGAIN : XFS_ERROR(ENOMEM);
303 298
304 if (XFS_BUF_GETERROR(bp) != 0) { 299 if (XFS_BUF_GETERROR(bp) != 0) {
@@ -385,14 +380,14 @@ xfs_trans_read_buf(
385 } 380 }
386 381
387 /* 382 /*
388 * We always specify the BUF_BUSY flag within a transaction so 383 * We always specify the XBF_DONT_BLOCK flag within a transaction
389 * that get_buf does not try to push out a delayed write buffer 384 * so that get_buf does not try to push out a delayed write buffer
390 * which might cause another transaction to take place (if the 385 * which might cause another transaction to take place (if the
391 * buffer was delayed alloc). Such recursive transactions can 386 * buffer was delayed alloc). Such recursive transactions can
392 * easily deadlock with our current transaction as well as cause 387 * easily deadlock with our current transaction as well as cause
393 * us to run out of stack space. 388 * us to run out of stack space.
394 */ 389 */
395 bp = xfs_buf_read(target, blkno, len, flags | BUF_BUSY); 390 bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
396 if (bp == NULL) { 391 if (bp == NULL) {
397 *bpp = NULL; 392 *bpp = NULL;
398 return 0; 393 return 0;
@@ -424,40 +419,9 @@ xfs_trans_read_buf(
424 if (XFS_FORCED_SHUTDOWN(mp)) 419 if (XFS_FORCED_SHUTDOWN(mp))
425 goto shutdown_abort; 420 goto shutdown_abort;
426 421
427 /* 422 _xfs_trans_bjoin(tp, bp, 1);
428 * The xfs_buf_log_item pointer is stored in b_fsprivate. If 423 trace_xfs_trans_read_buf(bp->b_fspriv);
429 * it doesn't have one yet, then allocate one and initialize it.
430 * The checks to see if one is there are in xfs_buf_item_init().
431 */
432 xfs_buf_item_init(bp, tp->t_mountp);
433 424
434 /*
435 * Set the recursion count for the buffer within this transaction
436 * to 0.
437 */
438 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
439 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
440 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
441 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
442 bip->bli_recur = 0;
443
444 /*
445 * Take a reference for this transaction on the buf item.
446 */
447 atomic_inc(&bip->bli_refcount);
448
449 /*
450 * Get a log_item_desc to point at the new item.
451 */
452 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
453
454 /*
455 * Initialize b_fsprivate2 so we can find it with incore_match()
456 * above.
457 */
458 XFS_BUF_SET_FSPRIVATE2(bp, tp);
459
460 trace_xfs_trans_read_buf(bip);
461 *bpp = bp; 425 *bpp = bp;
462 return 0; 426 return 0;
463 427
@@ -472,8 +436,8 @@ shutdown_abort:
472 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) 436 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
473 cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp); 437 cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp);
474#endif 438#endif
475 ASSERT((XFS_BUF_BFLAGS(bp) & (XFS_B_STALE|XFS_B_DELWRI)) != 439 ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) !=
476 (XFS_B_STALE|XFS_B_DELWRI)); 440 (XBF_STALE|XBF_DELWRI));
477 441
478 trace_xfs_trans_read_buf_shut(bp, _RET_IP_); 442 trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
479 xfs_buf_relse(bp); 443 xfs_buf_relse(bp);
@@ -622,53 +586,6 @@ xfs_trans_brelse(xfs_trans_t *tp,
622} 586}
623 587
624/* 588/*
625 * Add the locked buffer to the transaction.
626 * The buffer must be locked, and it cannot be associated with any
627 * transaction.
628 *
629 * If the buffer does not yet have a buf log item associated with it,
630 * then allocate one for it. Then add the buf item to the transaction.
631 */
632void
633xfs_trans_bjoin(xfs_trans_t *tp,
634 xfs_buf_t *bp)
635{
636 xfs_buf_log_item_t *bip;
637
638 ASSERT(XFS_BUF_ISBUSY(bp));
639 ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
640
641 /*
642 * The xfs_buf_log_item pointer is stored in b_fsprivate. If
643 * it doesn't have one yet, then allocate one and initialize it.
644 * The checks to see if one is there are in xfs_buf_item_init().
645 */
646 xfs_buf_item_init(bp, tp->t_mountp);
647 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
648 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
649 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
650 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
651
652 /*
653 * Take a reference for this transaction on the buf item.
654 */
655 atomic_inc(&bip->bli_refcount);
656
657 /*
658 * Get a log_item_desc to point at the new item.
659 */
660 (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip);
661
662 /*
663 * Initialize b_fsprivate2 so we can find it with incore_match()
664 * in xfs_trans_get_buf() and friends above.
665 */
666 XFS_BUF_SET_FSPRIVATE2(bp, tp);
667
668 trace_xfs_trans_bjoin(bip);
669}
670
671/*
672 * Mark the buffer as not needing to be unlocked when the buf item's 589 * Mark the buffer as not needing to be unlocked when the buf item's
673 * IOP_UNLOCK() routine is called. The buffer must already be locked 590 * IOP_UNLOCK() routine is called. The buffer must already be locked
674 * and associated with the given transaction. 591 * and associated with the given transaction.
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index d725428c9df6..b09904555d07 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -151,8 +151,8 @@ typedef enum {
151} xfs_btnum_t; 151} xfs_btnum_t;
152 152
153struct xfs_name { 153struct xfs_name {
154 const char *name; 154 const unsigned char *name;
155 int len; 155 int len;
156}; 156};
157 157
158#endif /* __XFS_TYPES_H__ */ 158#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 6558ffd8d140..9d376be0ea38 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -70,7 +70,6 @@ xfs_setattr(
70 uint commit_flags=0; 70 uint commit_flags=0;
71 uid_t uid=0, iuid=0; 71 uid_t uid=0, iuid=0;
72 gid_t gid=0, igid=0; 72 gid_t gid=0, igid=0;
73 int timeflags = 0;
74 struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; 73 struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2;
75 int need_iolock = 1; 74 int need_iolock = 1;
76 75
@@ -135,16 +134,13 @@ xfs_setattr(
135 if (flags & XFS_ATTR_NOLOCK) 134 if (flags & XFS_ATTR_NOLOCK)
136 need_iolock = 0; 135 need_iolock = 0;
137 if (!(mask & ATTR_SIZE)) { 136 if (!(mask & ATTR_SIZE)) {
138 if ((mask != (ATTR_CTIME|ATTR_ATIME|ATTR_MTIME)) || 137 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
139 (mp->m_flags & XFS_MOUNT_WSYNC)) { 138 commit_flags = 0;
140 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); 139 code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp),
141 commit_flags = 0; 140 0, 0, 0);
142 if ((code = xfs_trans_reserve(tp, 0, 141 if (code) {
143 XFS_ICHANGE_LOG_RES(mp), 0, 142 lock_flags = 0;
144 0, 0))) { 143 goto error_return;
145 lock_flags = 0;
146 goto error_return;
147 }
148 } 144 }
149 } else { 145 } else {
150 if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) && 146 if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
@@ -260,7 +256,7 @@ xfs_setattr(
260 iattr->ia_size > ip->i_d.di_size) { 256 iattr->ia_size > ip->i_d.di_size) {
261 code = xfs_flush_pages(ip, 257 code = xfs_flush_pages(ip,
262 ip->i_d.di_size, iattr->ia_size, 258 ip->i_d.di_size, iattr->ia_size,
263 XFS_B_ASYNC, FI_NONE); 259 XBF_ASYNC, FI_NONE);
264 } 260 }
265 261
266 /* wait for all I/O to complete */ 262 /* wait for all I/O to complete */
@@ -295,15 +291,23 @@ xfs_setattr(
295 * or we are explicitly asked to change it. This handles 291 * or we are explicitly asked to change it. This handles
296 * the semantic difference between truncate() and ftruncate() 292 * the semantic difference between truncate() and ftruncate()
297 * as implemented in the VFS. 293 * as implemented in the VFS.
294 *
295 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME
296 * is a special case where we need to update the times despite
297 * not having these flags set. For all other operations the
298 * VFS set these flags explicitly if it wants a timestamp
299 * update.
298 */ 300 */
299 if (iattr->ia_size != ip->i_size || (mask & ATTR_CTIME)) 301 if (iattr->ia_size != ip->i_size &&
300 timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 302 (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
303 iattr->ia_ctime = iattr->ia_mtime =
304 current_fs_time(inode->i_sb);
305 mask |= ATTR_CTIME | ATTR_MTIME;
306 }
301 307
302 if (iattr->ia_size > ip->i_size) { 308 if (iattr->ia_size > ip->i_size) {
303 ip->i_d.di_size = iattr->ia_size; 309 ip->i_d.di_size = iattr->ia_size;
304 ip->i_size = iattr->ia_size; 310 ip->i_size = iattr->ia_size;
305 if (!(flags & XFS_ATTR_DMI))
306 xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
307 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 311 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
308 } else if (iattr->ia_size <= ip->i_size || 312 } else if (iattr->ia_size <= ip->i_size ||
309 (iattr->ia_size == 0 && ip->i_d.di_nextents)) { 313 (iattr->ia_size == 0 && ip->i_d.di_nextents)) {
@@ -374,9 +378,6 @@ xfs_setattr(
374 ip->i_d.di_gid = gid; 378 ip->i_d.di_gid = gid;
375 inode->i_gid = gid; 379 inode->i_gid = gid;
376 } 380 }
377
378 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
379 timeflags |= XFS_ICHGTIME_CHG;
380 } 381 }
381 382
382 /* 383 /*
@@ -393,51 +394,37 @@ xfs_setattr(
393 394
394 inode->i_mode &= S_IFMT; 395 inode->i_mode &= S_IFMT;
395 inode->i_mode |= mode & ~S_IFMT; 396 inode->i_mode |= mode & ~S_IFMT;
396
397 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
398 timeflags |= XFS_ICHGTIME_CHG;
399 } 397 }
400 398
401 /* 399 /*
402 * Change file access or modified times. 400 * Change file access or modified times.
403 */ 401 */
404 if (mask & (ATTR_ATIME|ATTR_MTIME)) { 402 if (mask & ATTR_ATIME) {
405 if (mask & ATTR_ATIME) { 403 inode->i_atime = iattr->ia_atime;
406 inode->i_atime = iattr->ia_atime; 404 ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
407 ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; 405 ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
408 ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; 406 ip->i_update_core = 1;
409 ip->i_update_core = 1;
410 }
411 if (mask & ATTR_MTIME) {
412 inode->i_mtime = iattr->ia_mtime;
413 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
414 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
415 timeflags &= ~XFS_ICHGTIME_MOD;
416 timeflags |= XFS_ICHGTIME_CHG;
417 }
418 if (tp && (mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)))
419 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
420 } 407 }
421 408 if (mask & ATTR_CTIME) {
422 /*
423 * Change file inode change time only if ATTR_CTIME set
424 * AND we have been called by a DMI function.
425 */
426
427 if ((flags & XFS_ATTR_DMI) && (mask & ATTR_CTIME)) {
428 inode->i_ctime = iattr->ia_ctime; 409 inode->i_ctime = iattr->ia_ctime;
429 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; 410 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
430 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; 411 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
431 ip->i_update_core = 1; 412 ip->i_update_core = 1;
432 timeflags &= ~XFS_ICHGTIME_CHG; 413 }
414 if (mask & ATTR_MTIME) {
415 inode->i_mtime = iattr->ia_mtime;
416 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
417 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
418 ip->i_update_core = 1;
433 } 419 }
434 420
435 /* 421 /*
436 * Send out timestamp changes that need to be set to the 422 * And finally, log the inode core if any attribute in it
437 * current time. Not done when called by a DMI function. 423 * has been changed.
438 */ 424 */
439 if (timeflags && !(flags & XFS_ATTR_DMI)) 425 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE|
440 xfs_ichgtime(ip, timeflags); 426 ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
427 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
441 428
442 XFS_STATS_INC(xs_ig_attrchg); 429 XFS_STATS_INC(xs_ig_attrchg);
443 430
@@ -452,12 +439,10 @@ xfs_setattr(
452 * mix so this probably isn't worth the trouble to optimize. 439 * mix so this probably isn't worth the trouble to optimize.
453 */ 440 */
454 code = 0; 441 code = 0;
455 if (tp) { 442 if (mp->m_flags & XFS_MOUNT_WSYNC)
456 if (mp->m_flags & XFS_MOUNT_WSYNC) 443 xfs_trans_set_sync(tp);
457 xfs_trans_set_sync(tp);
458 444
459 code = xfs_trans_commit(tp, commit_flags); 445 code = xfs_trans_commit(tp, commit_flags);
460 }
461 446
462 xfs_iunlock(ip, lock_flags); 447 xfs_iunlock(ip, lock_flags);
463 448
@@ -599,116 +584,6 @@ xfs_readlink(
599} 584}
600 585
601/* 586/*
602 * xfs_fsync
603 *
604 * This is called to sync the inode and its data out to disk. We need to hold
605 * the I/O lock while flushing the data, and the inode lock while flushing the
606 * inode. The inode lock CANNOT be held while flushing the data, so acquire
607 * after we're done with that.
608 */
609int
610xfs_fsync(
611 xfs_inode_t *ip)
612{
613 xfs_trans_t *tp;
614 int error = 0;
615 int log_flushed = 0, changed = 1;
616
617 xfs_itrace_entry(ip);
618
619 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
620 return XFS_ERROR(EIO);
621
622 /*
623 * We always need to make sure that the required inode state is safe on
624 * disk. The inode might be clean but we still might need to force the
625 * log because of committed transactions that haven't hit the disk yet.
626 * Likewise, there could be unflushed non-transactional changes to the
627 * inode core that have to go to disk and this requires us to issue
628 * a synchronous transaction to capture these changes correctly.
629 *
630 * This code relies on the assumption that if the update_* fields
631 * of the inode are clear and the inode is unpinned then it is clean
632 * and no action is required.
633 */
634 xfs_ilock(ip, XFS_ILOCK_SHARED);
635
636 if (!ip->i_update_core) {
637 /*
638 * Timestamps/size haven't changed since last inode flush or
639 * inode transaction commit. That means either nothing got
640 * written or a transaction committed which caught the updates.
641 * If the latter happened and the transaction hasn't hit the
642 * disk yet, the inode will be still be pinned. If it is,
643 * force the log.
644 */
645
646 xfs_iunlock(ip, XFS_ILOCK_SHARED);
647
648 if (xfs_ipincount(ip)) {
649 error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
650 XFS_LOG_FORCE | XFS_LOG_SYNC,
651 &log_flushed);
652 } else {
653 /*
654 * If the inode is not pinned and nothing has changed
655 * we don't need to flush the cache.
656 */
657 changed = 0;
658 }
659 } else {
660 /*
661 * Kick off a transaction to log the inode core to get the
662 * updates. The sync transaction will also force the log.
663 */
664 xfs_iunlock(ip, XFS_ILOCK_SHARED);
665 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
666 error = xfs_trans_reserve(tp, 0,
667 XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
668 if (error) {
669 xfs_trans_cancel(tp, 0);
670 return error;
671 }
672 xfs_ilock(ip, XFS_ILOCK_EXCL);
673
674 /*
675 * Note - it's possible that we might have pushed ourselves out
676 * of the way during trans_reserve which would flush the inode.
677 * But there's no guarantee that the inode buffer has actually
678 * gone out yet (it's delwri). Plus the buffer could be pinned
679 * anyway if it's part of an inode in another recent
680 * transaction. So we play it safe and fire off the
681 * transaction anyway.
682 */
683 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
684 xfs_trans_ihold(tp, ip);
685 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
686 xfs_trans_set_sync(tp);
687 error = _xfs_trans_commit(tp, 0, &log_flushed);
688
689 xfs_iunlock(ip, XFS_ILOCK_EXCL);
690 }
691
692 if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
693 /*
694 * If the log write didn't issue an ordered tag we need
695 * to flush the disk cache for the data device now.
696 */
697 if (!log_flushed)
698 xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
699
700 /*
701 * If this inode is on the RT dev we need to flush that
702 * cache as well.
703 */
704 if (XFS_IS_REALTIME_INODE(ip))
705 xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
706 }
707
708 return error;
709}
710
711/*
712 * Flags for xfs_free_eofblocks 587 * Flags for xfs_free_eofblocks
713 */ 588 */
714#define XFS_FREE_EOF_TRYLOCK (1<<0) 589#define XFS_FREE_EOF_TRYLOCK (1<<0)
@@ -1111,7 +986,7 @@ xfs_release(
1111 */ 986 */
1112 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); 987 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1113 if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) 988 if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0)
1114 xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE); 989 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
1115 } 990 }
1116 991
1117 if (ip->i_d.di_nlink != 0) { 992 if (ip->i_d.di_nlink != 0) {
@@ -2214,7 +2089,8 @@ xfs_symlink(
2214 if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) { 2089 if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
2215 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp, 2090 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
2216 DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, 2091 DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2217 link_name->name, target_path, 0, 0, 0); 2092 link_name->name,
2093 (unsigned char *)target_path, 0, 0, 0);
2218 if (error) 2094 if (error)
2219 return error; 2095 return error;
2220 } 2096 }
@@ -2410,7 +2286,8 @@ std_return:
2410 dp, DM_RIGHT_NULL, 2286 dp, DM_RIGHT_NULL,
2411 error ? NULL : ip, 2287 error ? NULL : ip,
2412 DM_RIGHT_NULL, link_name->name, 2288 DM_RIGHT_NULL, link_name->name,
2413 target_path, 0, error, 0); 2289 (unsigned char *)target_path,
2290 0, error, 0);
2414 } 2291 }
2415 2292
2416 if (!error) 2293 if (!error)
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 167a467403a5..d8dfa8d0dadd 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -21,7 +21,6 @@ int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
21#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */ 21#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */
22 22
23int xfs_readlink(struct xfs_inode *ip, char *link); 23int xfs_readlink(struct xfs_inode *ip, char *link);
24int xfs_fsync(struct xfs_inode *ip);
25int xfs_release(struct xfs_inode *ip); 24int xfs_release(struct xfs_inode *ip);
26int xfs_inactive(struct xfs_inode *ip); 25int xfs_inactive(struct xfs_inode *ip);
27int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, 26int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
@@ -43,25 +42,13 @@ int xfs_change_file_space(struct xfs_inode *ip, int cmd,
43int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, 42int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
44 struct xfs_inode *src_ip, struct xfs_inode *target_dp, 43 struct xfs_inode *src_ip, struct xfs_inode *target_dp,
45 struct xfs_name *target_name, struct xfs_inode *target_ip); 44 struct xfs_name *target_name, struct xfs_inode *target_ip);
46int xfs_attr_get(struct xfs_inode *ip, const char *name, char *value, 45int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
47 int *valuelenp, int flags); 46 unsigned char *value, int *valuelenp, int flags);
48int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value, 47int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
49 int valuelen, int flags); 48 unsigned char *value, int valuelen, int flags);
50int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags); 49int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
51int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, 50int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
52 int flags, struct attrlist_cursor_kern *cursor); 51 int flags, struct attrlist_cursor_kern *cursor);
53ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb,
54 const struct iovec *iovp, unsigned int segs,
55 loff_t *offset, int ioflags);
56ssize_t xfs_splice_read(struct xfs_inode *ip, struct file *infilp,
57 loff_t *ppos, struct pipe_inode_info *pipe, size_t count,
58 int flags, int ioflags);
59ssize_t xfs_splice_write(struct xfs_inode *ip,
60 struct pipe_inode_info *pipe, struct file *outfilp,
61 loff_t *ppos, size_t count, int flags, int ioflags);
62ssize_t xfs_write(struct xfs_inode *xip, struct kiocb *iocb,
63 const struct iovec *iovp, unsigned int nsegs,
64 loff_t *offset, int ioflags);
65int xfs_bmap(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, 52int xfs_bmap(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
66 int flags, struct xfs_iomap *iomapp, int *niomaps); 53 int flags, struct xfs_iomap *iomapp, int *niomaps);
67void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first, 54void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first,
@@ -72,4 +59,6 @@ int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
72 xfs_off_t last, uint64_t flags, int fiopt); 59 xfs_off_t last, uint64_t flags, int fiopt);
73int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last); 60int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
74 61
62int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
63
75#endif /* _XFS_VNODEOPS_H */ 64#endif /* _XFS_VNODEOPS_H */